PRELIMINARY DATA PREPARATION

This section includes importing the data, creating new variables, and establishing the data frames for the initial analysis.

Import raw data

# Base directory of the raw DrivenData "DengAI" csv files.  Kept in one
# variable so moving the project only requires editing this single path
# instead of four separate read.csv calls.
data_dir <- "D:/Google Drive/RYERSON/CKME 136/DengAI/DATASET"

dengue_features_test <- read.csv(file.path(data_dir, "dengue_features_test.csv"),
                                 header = TRUE, stringsAsFactors = FALSE)
dengue_features_train <- read.csv(file.path(data_dir, "dengue_features_train.csv"),
                                  header = TRUE, stringsAsFactors = FALSE)
dengue_labels_train <- read.csv(file.path(data_dir, "dengue_labels_train.csv"),
                                header = TRUE, stringsAsFactors = FALSE)
submission_format <- read.csv(file.path(data_dir, "submission_format.csv"),
                              header = TRUE, stringsAsFactors = FALSE)

Convert week_start_date to date format

# week_start_date arrives as "YYYY-MM-DD" character strings; store it as a
# proper Date so time-axis plots and min/max comparisons work correctly.
dengue_features_test$week_start_date <- as.Date(dengue_features_test$week_start_date,
                                                format = "%Y-%m-%d")
dengue_features_train$week_start_date <- as.Date(dengue_features_train$week_start_date,
                                                 format = "%Y-%m-%d")

Convert city to factor

# city is a two-level categorical ("iq" = Iquitos, "sj" = San Juan);
# encode it as a factor in both the test and training feature frames.
dengue_features_test$city <- factor(dengue_features_test$city)
dengue_features_train$city <- factor(dengue_features_train$city)

Rescale the variables so that they are all in Celsius and mm

# Shift every reanalysis temperature LEVEL from Kelvin to Celsius so they
# are directly comparable with the station_*_c columns.  The original code
# repeated the same two lines five times; loop over the column names instead
# so adding/removing a column is a one-line change.
kelvin_cols <- c("reanalysis_dew_point_temp_k",
                 "reanalysis_air_temp_k",
                 "reanalysis_max_air_temp_k",
                 "reanalysis_min_air_temp_k",
                 "reanalysis_avg_temp_k")

for (col in kelvin_cols) {
  dengue_features_train[[col]] <- dengue_features_train[[col]] - 273.15
  dengue_features_test[[col]] <- dengue_features_test[[col]] - 273.15
}
rm(kelvin_cols, col)

# reanalysis_tdtr_k is a diurnal temperature RANGE, not a level: a
# difference of two temperatures has the same magnitude in Kelvin and
# Celsius, so no -273.15 offset applies (its summary below shows small
# positive values ~1-16, consistent with a daily spread).
# dengue_features_train$reanalysis_tdtr_k <- dengue_features_train$reanalysis_tdtr_k - 273.15
# dengue_features_test$reanalysis_tdtr_k <- dengue_features_test$reanalysis_tdtr_k - 273.15

# Sanity check: after the Kelvin -> Celsius shift, the temperature levels
# should sit in plausible Celsius ranges (they do: roughly 13-41 below),
# while reanalysis_tdtr_k remains a small positive daily range (~1-16),
# confirming it did not need the offset conversion.
summary(dengue_features_train$reanalysis_dew_point_temp_k)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   16.49   20.97   22.49   22.10   23.31   25.30      10
summary(dengue_features_train$reanalysis_air_temp_k)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   21.49   24.51   25.50   25.55   26.68   29.05      10
summary(dengue_features_train$reanalysis_max_air_temp_k)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   24.65   27.85   29.25   30.28   32.35   40.85      10
summary(dengue_features_train$reanalysis_min_air_temp_k)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   13.75   20.75   23.05   22.57   24.75   26.75      10
summary(dengue_features_train$reanalysis_avg_temp_k)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   21.74   25.11   26.14   26.08   27.06   29.78      10
summary(dengue_features_train$reanalysis_tdtr_k)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.357   2.329   2.857   4.904   7.625  16.029      10

Merge test and train set without the total_cases

df <- rbind(dengue_features_train,dengue_features_test)

Divide the test into sj and iq separately

# Split the TEST feature rows by city ("iq" = Iquitos, "sj" = San Juan)
iq_features_test <- dengue_features_test[dengue_features_test$city == "iq", ]
sj_features_test <- dengue_features_test[dengue_features_test$city == "sj", ]

Divide training sets into sj and iq separately

# Split the TRAINING feature rows by city
iq_features_train <- dengue_features_train[dengue_features_train$city == "iq", ]
sj_features_train <- dengue_features_train[dengue_features_train$city == "sj", ]

Divide the labels sets into sj and iq separately

# Split the label rows (total_cases) by city
iq_labels_train <- dengue_labels_train[dengue_labels_train$city == "iq", ]
sj_labels_train <- dengue_labels_train[dengue_labels_train$city == "sj", ]

Merge test and training sets separately for each city without total_cases

# Per-city frames combining training + test features (train rows first, to
# mirror the combined `df` frame); no total_cases column here.
sj <- rbind(sj_features_train, sj_features_test)
iq <- rbind(iq_features_train, iq_features_test)

Merge train and label sets to include total_cases

df_train_labels <- merge(dengue_features_train, dengue_labels_train, by=c("city","year","weekofyear"))

Merge test and training sets separately for each city including total_cases

# Per-city labelled training frames, built with the same merge keys as above
sj_train_labels <- merge(x = sj_features_train,
                         y = sj_labels_train,
                         by = c("city", "year", "weekofyear"))
iq_train_labels <- merge(x = iq_features_train,
                         y = iq_labels_train,
                         by = c("city", "year", "weekofyear"))

INITIAL & EXPLORATORY ANALYSIS

In this section, we summarize the values of the data frames (together and by city). We also produce the following graphs and tests:

  1. Frequency histograms
  2. Bivariate analysis - line graphs for time analysis
  3. Bivariate analysis - scatterplot for total_cases by other variables
  4. Wilcoxon test for test of means between cities

Review summary stats for each city

library(psych)

# Descriptive statistics (mean, sd, skew, kurtosis, plus IQR and the
# 25th/75th percentiles) for every data frame, both combined and per city.
# The #View(...) lines are left commented for interactive inspection in
# RStudio; the summary objects are discarded at the end of this chunk.
df_test.summary <- psych::describe(dengue_features_test, IQR=TRUE, quant=c(.25,.75) )
#View(df_test.summary)

df_train.summary <- psych::describe(dengue_features_train, IQR=TRUE, quant=c(.25,.75) )
#View(df_train.summary)

sj_train.summary <- psych::describe(sj_train_labels, IQR=TRUE, quant=c(.25,.75) )
#View(sj_train.summary)

iq_train.summary <- psych::describe(iq_train_labels, IQR=TRUE, quant=c(.25,.75) )
#View(iq_train.summary)


df.summary <- psych::describe(df, IQR=TRUE, quant=c(.25,.75))
#View(df.summary)

sj.summary <- psych::describe(sj, IQR=TRUE, quant=c(.25,.75) )
#View(sj.summary)

iq.summary <- psych::describe(iq, IQR=TRUE, quant=c(.25,.75) )
#View(iq.summary)

# summary(dengue_features_test$week_start_date)
# summary(dengue_features_train$week_start_date)
# summary(iq_features_train$week_start_date)
# summary(iq_features_test$week_start_date)
# summary(sj_features_train$week_start_date)
# summary(sj_features_test$week_start_date)
 
# Drop the summary objects; they were only needed for interactive review
rm(df_test.summary, df_train.summary, sj_train.summary, iq_train.summary, df.summary, sj.summary, iq.summary)

GRAPH: Frequency histogram of all variables in training set (both cities together)

These graphs only include data from the training set as it includes total cases. Climate data across both training and test sets are below.

# Drop week_start_date: hist() needs numeric input and a Date column would
# not be meaningful as a frequency histogram here.
df_train_labels$week_start_date <- NULL

cnames <- colnames(df_train_labels)
par(mfrow = c(2, 2))
# Columns 1-3 are the identifiers (city, year, weekofyear); histogram the
# remaining climate columns plus total_cases, four panels per page.
for (i in 4:ncol(df_train_labels)) {
  hist(df_train_labels[[i]],
       breaks = 20,
       xlab = cnames[i],
       main = paste("Freq Histogram", cnames[i], sep = ": "))
}

rm(cnames, i)

GRAPH: Frequency histogram of all variables in training set for SJ

Same as above but only for SJ.

cnames <- colnames(df_train_labels)
par(mfrow = c(2, 2))
# Filter to the San Juan rows once, outside the loop
sj_data <- df_train_labels[df_train_labels$city == "sj", ]
for (i in 4:ncol(df_train_labels)) {
  hist(sj_data[[i]],
       breaks = 20,
       xlab = cnames[i],
       main = paste("Freq Histogram for SJ", cnames[i], sep = ": "))
}

rm(cnames, i, sj_data)

GRAPH: Frequency histogram of all variables in training set for IQ

Same as above but only for IQ.

cnames <- colnames(df_train_labels)
par(mfrow = c(2, 2))
# Filter to the Iquitos rows once, outside the loop
iq_data <- df_train_labels[df_train_labels$city == "iq", ]
for (i in 4:ncol(df_train_labels)) {
  hist(iq_data[[i]],
       breaks = 20,
       xlab = cnames[i],
       main = paste("Freq Histogram for IQ", cnames[i], sep = ": "))
}

rm(cnames, i, iq_data)

GRAPH: Climate variables by time for SJ

Includes all the data from the test and training sets by time for SJ; therefore total_cases is not included. Total_cases by time is done separately.

cnames <- colnames(sj)
par(mfrow = c(2, 2))
# Columns 1-4 are city/year/weekofyear/week_start_date; draw each climate
# series (train + test rows combined) as a line against the week start date.
# plot(type = "l") draws the same line the original type = "n" + lines()
# pair produced.
for (i in 5:ncol(sj)) {
  plot(sj$week_start_date, sj[[i]],
       type = "l",
       ylim = range(sj[[i]], na.rm = TRUE),
       ylab = cnames[i],
       main = paste("Time Analysis for SJ", cnames[i], sep = ": "))
}

rm(cnames, i)

GRAPH: Climate variables by time for IQ

Includes all the data from the test and training sets by time for IQ; therefore total_cases is not included. Total_cases by time is done separately.

cnames <- colnames(iq)
par(mfrow = c(2, 2))
# Same layout as the SJ section: one line plot per climate column over time
for (i in 5:ncol(iq)) {
  plot(iq$week_start_date, iq[[i]],
       type = "l",
       ylim = range(iq[[i]], na.rm = TRUE),
       ylab = cnames[i],
       main = paste("Time Analysis for IQ", cnames[i], sep = ": "))
}

rm(cnames, i)

GRAPH: Climate variables by week for SJ

Includes all the data from the test and training sets by week for SJ; therefore total_cases is not included. Total_cases by time is done separately.

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
cnames <- colnames(sj)
par(mfrow=c(2,2))   # no effect on ggplot output; harmless leftover from the base-graphics chunks
for (i in 5:ncol(sj)) {
  # FIX: indexing sj[,i] inside aes() bypasses ggplot's data masking (the
  # vector is captured from the calling environment instead of from `data`),
  # which silently breaks if rows are reordered or subset.  Build an
  # explicit plotting frame instead; ylab() below still labels the axis
  # with the real column name.
  plot_dat <- data.frame(weekofyear = sj$weekofyear, value = sj[[i]])
  gg1 <- ggplot(plot_dat,
                aes(x = weekofyear,
                    y = value,
                    group = weekofyear)) +
    geom_boxplot() +
    scale_x_continuous(breaks = seq(1, 52, 1)) +
    ylab(cnames[i]) +
    ggtitle(paste(cnames[i]))

  print(gg1)
}
## Warning: Removed 234 rows containing non-finite values (stat_boxplot).

## Warning: Removed 60 rows containing non-finite values (stat_boxplot).

## Warning: Removed 20 rows containing non-finite values (stat_boxplot).

## Warning: Removed 20 rows containing non-finite values (stat_boxplot).

## Warning: Removed 11 rows containing non-finite values (stat_boxplot).

## Warning: Removed 8 rows containing non-finite values (stat_boxplot).

## Warning: Removed 8 rows containing non-finite values (stat_boxplot).

## Warning: Removed 8 rows containing non-finite values (stat_boxplot).

## Warning: Removed 8 rows containing non-finite values (stat_boxplot).

## Warning: Removed 8 rows containing non-finite values (stat_boxplot).

## Warning: Removed 8 rows containing non-finite values (stat_boxplot).

## Warning: Removed 8 rows containing non-finite values (stat_boxplot).

## Warning: Removed 11 rows containing non-finite values (stat_boxplot).

## Warning: Removed 8 rows containing non-finite values (stat_boxplot).

## Warning: Removed 8 rows containing non-finite values (stat_boxplot).

## Warning: Removed 8 rows containing non-finite values (stat_boxplot).

## Warning: Removed 8 rows containing non-finite values (stat_boxplot).

## Warning: Removed 8 rows containing non-finite values (stat_boxplot).

## Warning: Removed 8 rows containing non-finite values (stat_boxplot).

## Warning: Removed 8 rows containing non-finite values (stat_boxplot).

rm(cnames, i, gg1)

GRAPH: Climate variables by week for IQ

Includes all the data from the test and training sets by week for IQ; therefore total_cases is not included. Total_cases by time is done separately.

library(ggplot2)

cnames <- colnames(iq)
par(mfrow=c(2,2))   # no effect on ggplot output; harmless leftover
for (i in 5:ncol(iq)) {
  # BUG FIX: this loop previously plotted the SJ frame (ggplot(sj,
  # aes(y = sj[,i], ...))) — a copy/paste error from the SJ section — so
  # the "IQ" weekly boxplots were actually San Juan's data.  Plot the IQ
  # data here, using an explicit plotting frame rather than indexing
  # inside aes().
  plot_dat <- data.frame(weekofyear = iq$weekofyear, value = iq[[i]])
  gg1 <- ggplot(plot_dat,
                aes(x = weekofyear,
                    y = value,
                    group = weekofyear)) +
    geom_boxplot() +
    scale_x_continuous(breaks = seq(1, 52, 1)) +
    ylab(cnames[i]) +
    ggtitle(paste(cnames[i]))

  print(gg1)
}
## Warning: Removed 234 rows containing non-finite values (stat_boxplot).

## Warning: Removed 60 rows containing non-finite values (stat_boxplot).

## Warning: Removed 20 rows containing non-finite values (stat_boxplot).

## Warning: Removed 20 rows containing non-finite values (stat_boxplot).

## Warning: Removed 11 rows containing non-finite values (stat_boxplot).

## Warning: Removed 8 rows containing non-finite values (stat_boxplot).

## Warning: Removed 8 rows containing non-finite values (stat_boxplot).

## Warning: Removed 8 rows containing non-finite values (stat_boxplot).

## Warning: Removed 8 rows containing non-finite values (stat_boxplot).

## Warning: Removed 8 rows containing non-finite values (stat_boxplot).

## Warning: Removed 8 rows containing non-finite values (stat_boxplot).

## Warning: Removed 8 rows containing non-finite values (stat_boxplot).

## Warning: Removed 11 rows containing non-finite values (stat_boxplot).

## Warning: Removed 8 rows containing non-finite values (stat_boxplot).

## Warning: Removed 8 rows containing non-finite values (stat_boxplot).

## Warning: Removed 8 rows containing non-finite values (stat_boxplot).

## Warning: Removed 8 rows containing non-finite values (stat_boxplot).

## Warning: Removed 8 rows containing non-finite values (stat_boxplot).

## Warning: Removed 8 rows containing non-finite values (stat_boxplot).

## Warning: Removed 8 rows containing non-finite values (stat_boxplot).

rm(cnames, i, gg1)

GRAPH: Total_cases by time for SJ, IQ and together

Line graph of all data by total cases. This uses only the training set.

library(ggplot2)

# Re-merge features and labels: week_start_date was removed from
# df_train_labels for the histogram section above, so rebuild the frame here.
df_train_labels <- merge(dengue_features_train, dengue_labels_train, by=c("city","year","weekofyear"))

# (The original par(mfcol=c(1,3)) call was removed: par() only affects base
# graphics and has no effect on the ggplot calls below.)

# Dengue Cases, both cities together
ggplot(data = df_train_labels, aes(x=week_start_date, y=total_cases)) +
       geom_bar(stat = "identity", fill = "purple") +
  labs(title = "Total Dengue Cases - both cities combined",
       subtitle = paste(min(df_train_labels$week_start_date),max(df_train_labels$week_start_date), sep = " to "),
       x = "Date", y = "Total dengue cases")

# Dengue Cases for San Juan ("sj" in the DengAI data is San Juan,
# Puerto Rico — the original said "San Jose", which is incorrect)
ggplot(data = df_train_labels[df_train_labels$city == "sj",], aes(x=week_start_date, y=total_cases)) +
       geom_bar(stat = "identity", fill = "blue") +
  labs(title = "Total Dengue Cases in San Juan",
       subtitle = paste(min(df_train_labels$week_start_date[df_train_labels$city == "sj"]),max(df_train_labels$week_start_date[df_train_labels$city == "sj"]), sep = " to "),
       x = "Date", y = "Total dengue cases")

# Dengue Cases for Iquitos
ggplot(data = df_train_labels[df_train_labels$city == "iq",], aes(x=week_start_date, y=total_cases)) +
       geom_bar(stat = "identity", fill = "green") +
  labs(title = "Total Dengue Cases in Iquitos",
       subtitle = paste(min(df_train_labels$week_start_date[df_train_labels$city == "iq"]),max(df_train_labels$week_start_date[df_train_labels$city == "iq"]), sep = " to "),
       x = "Date", y = "Total dengue cases")

GRAPH: Average Total_cases by week for SJ, IQ

Line graph of all data by total cases. This uses only the training set.

library(ggplot2)

# Weekly seasonality of total_cases per city: a boxplot of the distribution
# for each week-of-year (red dot = weekly mean) plus a bar chart of the
# weekly mean on its own.
# NOTE(review): stat_summary()/geom_bar()'s `fun.y` argument was renamed to
# `fun` in ggplot2 3.3 and `fun.y` is deprecated there — fine for the
# ggplot2 version this document was rendered with, but confirm before
# upgrading the package.
gg1 <- ggplot(sj_train_labels,
                aes(x=weekofyear, 
                    y = total_cases, 
                    group = weekofyear)) +
    geom_boxplot() +
    scale_x_continuous(breaks=seq(1,52,1)) +
  stat_summary(fun.y=mean, geom="point", shape=20, size=3, color="red", fill="red") +
    ylab("Total cases") +
    ggtitle(paste("Boxplot: Total cases by Week for SJ")) 

    print(gg1)

gg3 <- ggplot(data=sj_labels_train, aes(x=weekofyear, y=total_cases)) +
  geom_bar(stat="summary", fun.y = "mean") +
  ggtitle(paste("Bar graph: Average total cases by Week for SJ")) +
  scale_x_continuous(breaks = seq(1,52, 2))

print(gg3)

gg2 <- ggplot(iq_train_labels,
                aes(x=weekofyear, 
                    y = total_cases, 
                    group = weekofyear)) +
    geom_boxplot() +
    scale_x_continuous(breaks=seq(1,52,1)) +
  stat_summary(fun.y=mean, geom="point", shape=20, size=3, color="red", fill="red") +
    ylab("Total cases") +
    ggtitle(paste("Boxplot: Total cases by Week for IQ")) 

    print(gg2)

gg4 <- ggplot(data=iq_labels_train, aes(x=weekofyear, y=total_cases)) +
  geom_bar(stat="summary", fun.y = "mean") +
  ggtitle(paste("Bar graph: Average total cases by Week for IQ")) +
  scale_x_continuous(breaks = seq(1,52, 2))

print(gg4)

    rm(gg1, gg2, gg3, gg4)

GRAPH: Total_cases by climate variables (both cities together)

Scatterplot using training set only.

cnames <- colnames(df_train_labels)
par(mfrow = c(2, 2))
# Columns 1-4 are identifiers; the last column is total_cases itself, so
# loop over the climate columns only and scatter each against total_cases.
for (i in 5:(ncol(df_train_labels) - 1)) {
  plot(df_train_labels$total_cases,
       df_train_labels[[i]],
       cex = 0.5,
       pch = 19,
       ylim = range(df_train_labels[[i]], na.rm = TRUE),
       main = paste("Total_cases by climate variables", cnames[i], sep = ": "),
       ylab = cnames[i])
}

rm(cnames, i)

GRAPH: Total_cases by climate variables for SJ

Same as above but for SJ

cnames <- colnames(df_train_labels)
par(mfrow = c(2, 2))
# Filter the San Juan rows once, outside the loop
sj_only <- df_train_labels[df_train_labels$city == "sj", ]
for (i in 5:(ncol(df_train_labels) - 1)) {
  plot(sj_only$total_cases,
       sj_only[[i]],
       cex = 0.5,
       pch = 19,
       # y-limits use BOTH cities' range (full df_train_labels), matching
       # the original so the SJ and IQ panels stay directly comparable
       ylim = range(df_train_labels[[i]], na.rm = TRUE),
       main = paste("Total_cases for SJ by climate variables", cnames[i], sep = ": "),
       ylab = cnames[i])
}

rm(cnames, i, sj_only)

GRAPH: Total_cases by climate variables for IQ

Same as above but for IQ.

cnames <- colnames(df_train_labels)
par(mfrow = c(2, 2))
# Filter the Iquitos rows once, outside the loop
iq_only <- df_train_labels[df_train_labels$city == "iq", ]
for (i in 5:(ncol(df_train_labels) - 1)) {
  plot(iq_only$total_cases,
       iq_only[[i]],
       cex = 0.5,
       pch = 19,
       # y-limits use BOTH cities' range, matching the original behavior
       ylim = range(df_train_labels[[i]], na.rm = TRUE),
       main = paste("Total_cases for IQ by climate variables", cnames[i], sep = ": "),
       ylab = cnames[i])
}

rm(cnames, i, iq_only)

Compare the means between same variables in different cities

We can see that the same feature is significantly different in each city

# Wilcoxon (Mann-Whitney) rank-sum test per column, comparing the SJ and IQ
# training distributions of each climate variable and of total_cases (the
# last column).  Columns are matched by POSITION in both frames — this is
# safe because sj_train_labels and iq_train_labels were built by the same
# merge() recipe and therefore share column order.
cnames <- colnames(sj_train_labels)
for (i in 5:(ncol(sj_train_labels))){
  wilt <- wilcox.test(sj_train_labels[,i],iq_train_labels[,i])  
  print(cnames[i])
  print(wilt)
}
## [1] "ndvi_ne"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  sj_train_labels[, i] and iq_train_labels[, i]
## W = 21691, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 
## [1] "ndvi_nw"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  sj_train_labels[, i] and iq_train_labels[, i]
## W = 32596, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 
## [1] "ndvi_se"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  sj_train_labels[, i] and iq_train_labels[, i]
## W = 107990, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 
## [1] "ndvi_sw"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  sj_train_labels[, i] and iq_train_labels[, i]
## W = 78560, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 
## [1] "precipitation_amt_mm"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  sj_train_labels[, i] and iq_train_labels[, i]
## W = 118470, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 
## [1] "reanalysis_air_temp_k"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  sj_train_labels[, i] and iq_train_labels[, i]
## W = 369950, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 
## [1] "reanalysis_avg_temp_k"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  sj_train_labels[, i] and iq_train_labels[, i]
## W = 255790, p-value = 0.03716
## alternative hypothesis: true location shift is not equal to 0
## 
## [1] "reanalysis_dew_point_temp_k"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  sj_train_labels[, i] and iq_train_labels[, i]
## W = 208230, p-value = 3.071e-05
## alternative hypothesis: true location shift is not equal to 0
## 
## [1] "reanalysis_max_air_temp_k"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  sj_train_labels[, i] and iq_train_labels[, i]
## W = 4645.5, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 
## [1] "reanalysis_min_air_temp_k"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  sj_train_labels[, i] and iq_train_labels[, i]
## W = 474700, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 
## [1] "reanalysis_precip_amt_kg_per_m2"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  sj_train_labels[, i] and iq_train_labels[, i]
## W = 139740, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 
## [1] "reanalysis_relative_humidity_percent"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  sj_train_labels[, i] and iq_train_labels[, i]
## W = 62770, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 
## [1] "reanalysis_sat_precip_amt_mm"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  sj_train_labels[, i] and iq_train_labels[, i]
## W = 118470, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 
## [1] "reanalysis_specific_humidity_g_per_kg"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  sj_train_labels[, i] and iq_train_labels[, i]
## W = 192510, p-value = 4.502e-10
## alternative hypothesis: true location shift is not equal to 0
## 
## [1] "reanalysis_tdtr_k"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  sj_train_labels[, i] and iq_train_labels[, i]
## W = 22, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 
## [1] "station_avg_temp_c"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  sj_train_labels[, i] and iq_train_labels[, i]
## W = 183690, p-value = 1.887e-08
## alternative hypothesis: true location shift is not equal to 0
## 
## [1] "station_diur_temp_rng_c"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  sj_train_labels[, i] and iq_train_labels[, i]
## W = 6834, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 
## [1] "station_max_temp_c"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  sj_train_labels[, i] and iq_train_labels[, i]
## W = 59998, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 
## [1] "station_min_temp_c"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  sj_train_labels[, i] and iq_train_labels[, i]
## W = 361870, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 
## [1] "station_precip_mm"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  sj_train_labels[, i] and iq_train_labels[, i]
## W = 142000, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 
## [1] "total_cases"
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  sj_train_labels[, i] and iq_train_labels[, i]
## W = 401310, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
rm(cnames, i, wilt)

Compare similar variable values within the dataset

There are several variables which appear to be the same feature but taken from a different source. For example, station_precip_mm, precipitation_amt_mm, and reanalysis_sat_precip_amt_mm all appear to be the same "Total Precipitation" value. Only one should be kept if they are the same.

Difference in max air temp

“station_max_temp_c” and “reanalysis_max_air_temp_k” (scaled to Celsius)

library(ggplot2)

# Gap between the station-measured and reanalysis max temperatures
# (the reanalysis column was already shifted to Celsius above)
sj_train_labels$max_air_diff <- sj_train_labels$station_max_temp_c -
  sj_train_labels$reanalysis_max_air_temp_k

# Summed difference per year (geom_col == geom_bar(stat = "identity"))
ggplot(sj_train_labels, aes(x = year, y = max_air_diff)) +
  geom_col()
## Warning: Removed 6 rows containing missing values (position_stack).

# Distribution of the difference per year
ggplot(sj_train_labels, aes(x = year, y = max_air_diff, group = year)) +
  geom_boxplot()
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).

# Temporary calendar-month column derived from week_start_date
sj_train_labels$month <- as.integer(format(sj_train_labels$week_start_date, "%m"))

# Distribution of the difference per calendar month
ggplot(sj_train_labels, aes(x = month, y = max_air_diff, group = month)) +
  geom_boxplot() +
  scale_x_continuous(breaks = seq(1, 12, 1))
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).

# Drop the temporary columns so the frame is unchanged for later sections
sj_train_labels$max_air_diff <- NULL
sj_train_labels$month <- NULL

Difference in min air temp

“station_min_temp_c” and “reanalysis_min_air_temp_k” (scaled to Celsius)

library(ggplot2)

# Gap between the station-measured and reanalysis min temperatures
sj_train_labels$min_air_diff <- sj_train_labels$station_min_temp_c -
  sj_train_labels$reanalysis_min_air_temp_k

# Summed difference per year
ggplot(sj_train_labels, aes(x = year, y = min_air_diff)) +
  geom_col()
## Warning: Removed 6 rows containing missing values (position_stack).

# Distribution of the difference per year
ggplot(sj_train_labels, aes(x = year, y = min_air_diff, group = year)) +
  geom_boxplot()
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).

# Temporary calendar-month column derived from week_start_date
sj_train_labels$month <- as.integer(format(sj_train_labels$week_start_date, "%m"))

# Distribution of the difference per calendar month
ggplot(sj_train_labels, aes(x = month, y = min_air_diff, group = month)) +
  geom_boxplot() +
  scale_x_continuous(breaks = seq(1, 12, 1))
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).

# Drop the temporary columns
sj_train_labels$min_air_diff <- NULL
sj_train_labels$month <- NULL

Difference in average air temp

“station_avg_temp_c” and “reanalysis_avg_temp_k” (scaled to Celsius)

library(ggplot2)

# Gap between the station-measured and reanalysis average temperatures
sj_train_labels$avg_air_diff <- sj_train_labels$station_avg_temp_c -
  sj_train_labels$reanalysis_avg_temp_k

# Summed difference per year
ggplot(sj_train_labels, aes(x = year, y = avg_air_diff)) +
  geom_col()
## Warning: Removed 6 rows containing missing values (position_stack).

# Distribution of the difference per year
ggplot(sj_train_labels, aes(x = year, y = avg_air_diff, group = year)) +
  geom_boxplot()
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).

# Temporary calendar-month column derived from week_start_date
sj_train_labels$month <- as.integer(format(sj_train_labels$week_start_date, "%m"))

# Distribution of the difference per calendar month
ggplot(sj_train_labels, aes(x = month, y = avg_air_diff, group = month)) +
  geom_boxplot() +
  scale_x_continuous(breaks = seq(1, 12, 1))
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).

# Drop the temporary columns
sj_train_labels$avg_air_diff <- NULL
sj_train_labels$month <- NULL

Difference in total precipitation

“station_precip_mm”, “precipitation_amt_mm”, “reanalysis_sat_precip_amt_mm”, “reanalysis_precip_amt_kg_per_m2”

library(ggplot2)

# The four candidate "total precipitation" columns; the pairwise differences
# below show whether any pair is in fact the same measurement duplicated.
precip <- c("station_precip_mm", "precipitation_amt_mm",
            "reanalysis_sat_precip_amt_mm", "reanalysis_precip_amt_kg_per_m2")

# Temporary calendar-month column for the monthly boxplots
sj_train_labels$month <- as.POSIXlt(sj_train_labels$week_start_date)$mon + 1



# Visit each unordered pair (i, j) with i < j, same order as before:
# (1,2), (1,3), (1,4), (2,3), (2,4), (3,4)
for (i in 1:3) {
  par(mfrow = c(1, 3))   # no effect on ggplot; harmless leftover
  p1 <- precip[i]
  for (j in (i + 1):4) {
    p2 <- precip[j]
    # Pairwise difference between the two precipitation measures; [[name]]
    # extraction replaces the original which(colnames == name) bookkeeping
    sj_train_labels$diff <- sj_train_labels[[p1]] - sj_train_labels[[p2]]

    # Summed difference per year
    gg1 <- ggplot(sj_train_labels,
                  aes(x = year, y = diff)) +
      geom_bar(stat = "identity", fill = "steelblue") +
      ggtitle(paste(p1, "&", p2))
    print(gg1)

    # Distribution of the difference per year
    gg2 <- ggplot(sj_train_labels,
                  aes(x = year, y = diff, group = year)) +
      geom_boxplot() +
      ggtitle(paste(p1, "&", p2))
    print(gg2)

    # Distribution of the difference per calendar month
    gg3 <- ggplot(sj_train_labels,
                  aes(x = month, y = diff, group = month)) +
      geom_boxplot() +
      scale_x_continuous(breaks = seq(1, 12, 1)) +
      ggtitle(paste(p1, "&", p2))
    print(gg3)
  }
}
## (each pair emits "Removed 6-9 rows containing missing/non-finite values"
## warnings from the NA rows, as in the originally rendered output)

# Drop the temporary columns and loop variables
sj_train_labels$diff <- NULL
sj_train_labels$month <- NULL 

rm(gg1, gg2, gg3, i, j, p1, p2, precip)

Review of climate factors independently (SJ ONLY)

This section of the exploratory analysis reviews how the major components of climate affect dengue cases. A 10-fold cross-validated decision tree is fit for each group of climate variables and scored by mean absolute error (MAE).

Decision Tree with vegetation data

library(rpart)
library(plyr)

# 10-fold cross-validation of an rpart tree predicting total_cases from the
# four vegetation (NDVI) indices only.
#
# FIX: total_cases is a continuous count, so rpart fits a regression
# (anova) tree and predict() returns continuous values.  The previous
# confusion-matrix score (1 - sum(diag(table(obs, pred)))/sum(...)) is
# meaningless for continuous predictions — they almost never exactly equal
# the observed counts, which is why it reported ~98% "error".  Score with
# mean absolute error (MAE) instead, as the section introduction states.
form <- "total_cases ~ ndvi_se + ndvi_sw + ndvi_ne + ndvi_nw"
folds <- split(sj_train_labels,
               cut(sample(seq_len(nrow(sj_train_labels))), 10))

fold_mae <- rep(NA, length(folds))

for (i in seq_along(folds)) {
  test <- ldply(folds[i], data.frame)
  train <- ldply(folds[-i], data.frame)
  tmp.model <- rpart(as.formula(form), train)
  tmp.predict <- predict(tmp.model, newdata = test)
  fold_mae[i] <- mean(abs(test$total_cases - tmp.predict), na.rm = TRUE)
}

print(sprintf("average MAE using k-fold cross validation and decision tree algorithm: %.3f cases", mean(fold_mae)))
# (output differs from the previously rendered ~98% error-rate figure)
rm(folds, test, tmp.model, train, fold_mae, form, i, tmp.predict)

Decision Tree with temperature data

library(rpart)
library(plyr)

# 10-fold cross-validation of an rpart regression tree on the temperature
# variables.  See the vegetation chunk above: scored with MAE because the
# previous confusion-matrix error rate is meaningless for a continuous
# regression prediction.
form <- "total_cases ~ station_max_temp_c + station_min_temp_c  + station_avg_temp_c  + station_diur_temp_rng_c + reanalysis_dew_point_temp_k  + reanalysis_air_temp_k + reanalysis_max_air_temp_k + reanalysis_min_air_temp_k + reanalysis_avg_temp_k + reanalysis_tdtr_k"

folds <- split(sj_train_labels,
               cut(sample(seq_len(nrow(sj_train_labels))), 10))
fold_mae <- rep(NA, length(folds))

for (i in seq_along(folds)) {
  test <- ldply(folds[i], data.frame)
  train <- ldply(folds[-i], data.frame)
  tmp.model <- rpart(as.formula(form), train)
  tmp.predict <- predict(tmp.model, newdata = test)
  fold_mae[i] <- mean(abs(test$total_cases - tmp.predict), na.rm = TRUE)
}

print(sprintf("average MAE using k-fold cross validation and decision tree algorithm: %.3f cases", mean(fold_mae)))
# (output differs from the previously rendered ~97% error-rate figure)
rm(folds, test, tmp.model, train, fold_mae, form, i, tmp.predict)

Decision Tree with precipitation data

library(rpart)
library(plyr)

# 10-fold cross-validation of an rpart regression tree on the precipitation
# variables, scored with MAE (see the vegetation chunk for why the previous
# confusion-matrix error rate was invalid here).
form <- "total_cases ~ station_precip_mm  + precipitation_amt_mm + reanalysis_sat_precip_amt_mm + reanalysis_precip_amt_kg_per_m2"

folds <- split(sj_train_labels,
               cut(sample(seq_len(nrow(sj_train_labels))), 10))
fold_mae <- rep(NA, length(folds))

for (i in seq_along(folds)) {
  test <- ldply(folds[i], data.frame)
  train <- ldply(folds[-i], data.frame)
  tmp.model <- rpart(as.formula(form), train)
  tmp.predict <- predict(tmp.model, newdata = test)
  fold_mae[i] <- mean(abs(test$total_cases - tmp.predict), na.rm = TRUE)
}

print(sprintf("average MAE using k-fold cross validation and decision tree algorithm: %.3f cases", mean(fold_mae)))
# (output differs from the previously rendered ~97% error-rate figure)
rm(folds, test, tmp.model, train, fold_mae, form, i, tmp.predict)

Decision Tree with humidity data

# Decision tree (rpart) using only humidity predictors,
# evaluated with 10-fold cross-validation on the San Juan training data.
library(rpart)
library(plyr)

form <- "total_cases ~ reanalysis_relative_humidity_percent +   reanalysis_specific_humidity_g_per_kg"

# Shuffle the row indices and cut them into 10 roughly equal folds.
folds <- split(sj_train_labels,
               cut(sample(seq_len(nrow(sj_train_labels))), 10))
errs.c50 <- rep(NA, length(folds))

for (i in seq_along(folds)) {
  test <- ldply(folds[i], data.frame)
  train <- ldply(folds[-i], data.frame)
  tmp.model <- rpart(as.formula(form), train)
  tmp.predict <- predict(tmp.model, newdata = test)
  # NOTE(review): confusion-matrix error on continuous regression
  # predictions is near 100% by construction; prefer MAE for counts.
  conf.mat <- table(test$total_cases, tmp.predict)
  errs.c50[i] <- 1 - sum(diag(conf.mat)) / sum(conf.mat)
}

print(sprintf("average error using k-fold cross validation and  decision tree algorithm: %.3f percent", 100*mean(abs(errs.c50))))
## [1] "average error using k-fold cross validation and  decision tree algorithm: 98.077 percent"
rm(folds, test, tmp.model, train, conf.mat, errs.c50, form, i, tmp.predict)

ANALYSIS OF OUTLIERS

GRAPH: Boxplot of climate variables (test and train)

Boxplot includes test and training set - NA still included

library(ggplot2)

# One boxplot per climate variable (columns 5..ncol of df), split by city.
# NAs are still present at this stage, so ggplot warns about removed
# non-finite rows (see the warnings recorded below).
cnames <- colnames(df)
for (i in 5:ncol(df)) {
  # Use the .data pronoun instead of df[, i] inside aes(): referencing the
  # data frame from the enclosing environment bypasses ggplot's data masking
  # and can silently break with grouping or facetting.
  p <- ggplot(df, aes(x = city, y = .data[[cnames[i]]], fill = city)) +
    geom_boxplot() +
    labs(title = "Boxplot of climate variables",
         subtitle = cnames[i],
         x = "City", y = cnames[i])
  print(p)
}
## Warning: Removed 237 rows containing non-finite values (stat_boxplot).

## Warning: Removed 63 rows containing non-finite values (stat_boxplot).

## Warning: Removed 23 rows containing non-finite values (stat_boxplot).

## Warning: Removed 23 rows containing non-finite values (stat_boxplot).

## Warning: Removed 15 rows containing non-finite values (stat_boxplot).

## Warning: Removed 12 rows containing non-finite values (stat_boxplot).

## Warning: Removed 12 rows containing non-finite values (stat_boxplot).

## Warning: Removed 12 rows containing non-finite values (stat_boxplot).

## Warning: Removed 12 rows containing non-finite values (stat_boxplot).

## Warning: Removed 12 rows containing non-finite values (stat_boxplot).

## Warning: Removed 12 rows containing non-finite values (stat_boxplot).

## Warning: Removed 12 rows containing non-finite values (stat_boxplot).

## Warning: Removed 15 rows containing non-finite values (stat_boxplot).

## Warning: Removed 12 rows containing non-finite values (stat_boxplot).

## Warning: Removed 12 rows containing non-finite values (stat_boxplot).

## Warning: Removed 55 rows containing non-finite values (stat_boxplot).

## Warning: Removed 55 rows containing non-finite values (stat_boxplot).

## Warning: Removed 23 rows containing non-finite values (stat_boxplot).

## Warning: Removed 23 rows containing non-finite values (stat_boxplot).

## Warning: Removed 27 rows containing non-finite values (stat_boxplot).

# Drop the temporaries created by the boxplot loop above
rm(cnames, i, p)

GRAPH: Boxplot of total cases

library(ggplot2)

# Distribution of weekly dengue case counts in each city.
print(
  ggplot(df_train_labels, aes(x = city, y = total_cases, fill = city)) +
    geom_boxplot() +
    labs(title = "Boxplot of Total_cases", x = "City", y = "Total_cases")
)

DATAFRAME CLEANUP 1

Clean up all the extra dataframes produced during the exploratory analysis

# Remove the exploratory data frames that are no longer needed; only the
# per-city *_train_labels objects are carried forward from here.
rm(list = c("dengue_features_test", "dengue_features_train",
            "dengue_labels_train",
            "sj_features_test", "sj_features_train", "sj_labels_train",
            "iq_features_test", "iq_features_train", "iq_labels_train",
            "df", "iq", "sj",
            "df_train_labels", "submission_format"))

MISSING VALUES

Check for missing values (is.na)

In this section, we look at the number of missing values. Later we will do something about these missing values.

# Count missing values per column for each city's training data.
# colSums(is.na(.)) is the vectorised equivalent of
# sapply(df, function(x) sum(is.na(x))) and always returns a named numeric.
sj_train.na <- colSums(is.na(sj_train_labels))

iq_train.na <- colSums(is.na(iq_train_labels))

#df_train_labels.na <- sum(is.na(df_train_labels$total_cases))
# View(sj_train.na)
# View(iq_train.na)


#df_train_labels.na
# The counts were only inspected interactively; drop them again.
rm(sj_train.na)
rm(iq_train.na)
#rm(df_train_labels.na)

Missing values: Remove all rows with an NA in it

# Missing-value strategy 1: drop every row containing any NA
sj_train_labels.naomit <- na.omit(sj_train_labels)
iq_train_labels.naomit <- na.omit(iq_train_labels)

Missing values: Using last non-NA value

# Missing-value strategy 2: carry the last observation forward (LOCF).
# Rows must be in chronological order first, otherwise values would be
# carried across unrelated weeks.
library(zoo)
#library(tidyverse)
library(plyr)
sj_train_labels <- sj_train_labels[order(sj_train_labels$year, sj_train_labels$weekofyear), ]
iq_train_labels <- iq_train_labels[order(iq_train_labels$year, iq_train_labels$weekofyear), ]

sj_train_labels.lastna <- sj_train_labels
iq_train_labels.lastna <- iq_train_labels

# na.rm = FALSE keeps leading NAs in place (a column that starts with NA has
# nothing to carry forward).  The zoo default na.rm = TRUE would drop those
# leading entries, shortening the column and making colwise() fail to
# rebuild the data frame.
sj_train_labels.lastna <- colwise(function(x) na.locf(x, na.rm = FALSE))(sj_train_labels.lastna)
iq_train_labels.lastna <- colwise(function(x) na.locf(x, na.rm = FALSE))(iq_train_labels.lastna)

#Issues using tidyverse as the locf function converts all values to character
# sj_train_labels.lastna <- sj_train_labels.lastna %>% do(na.locf(.))
# iq_train_labels.lastna <-iq_train_labels.lastna %>% do(na.locf(.))

# Sanity check: remaining NA count (non-zero only if a column begins with NA)
sum(is.na(sj_train_labels.lastna))
sum(is.na(iq_train_labels.lastna))

REMOVE week_start_date AND city FROM DATASET

Removing the city and the week_start_date columns from the dataset will allow for easier analysis.

# Keep versions that still include week_start_date for later use
sj_train_labels.startweek <- sj_train_labels.lastna
iq_train_labels.startweek <- iq_train_labels.lastna

# Drop the city column (constant within each per-city data frame)
sj_train_labels.naomit[["city"]] <- NULL
sj_train_labels.lastna[["city"]] <- NULL
sj_train_labels.startweek[["city"]] <- NULL

iq_train_labels.naomit[["city"]] <- NULL
iq_train_labels.lastna[["city"]] <- NULL
iq_train_labels.startweek[["city"]] <- NULL

# Drop week_start_date everywhere except the .startweek copies
sj_train_labels.naomit[["week_start_date"]] <- NULL
sj_train_labels.lastna[["week_start_date"]] <- NULL

iq_train_labels.naomit[["week_start_date"]] <- NULL
iq_train_labels.lastna[["week_start_date"]] <- NULL

CORRELATION ANALYSIS

In this section, we look at the correlation between the total_cases and the climate variables.

First we need to remove any of the non-numeric variables. The missing values are still present in this first correlation analysis; the analysis will be repeated after the missing values have been imputed.

Correlation analysis before missing values are addressed

library(corrplot)
## Warning: package 'corrplot' was built under R version 3.4.4
## corrplot 0.84 loaded
#Correlation for SJ

# Columns 5:25 are the numeric climate variables plus total_cases
# (columns 1-4 hold the city/year/week identifier fields).
sj_train_labels.cor <- data.frame(round(cor(sj_train_labels[, 5:25],
                                            use = "complete.obs"), 2))
#View(sj_train_labels.cor)

# mar reserves a top margin; without it corrplot clips the title off.
corrplot(as.matrix(sj_train_labels.cor),
         method = "number",
         type = "upper",
         title = "Corrplot for SJ",
         mar = c(0, 0, 1, 0))

#Correlation for IQ

iq_train_labels.cor <- data.frame(round(cor(iq_train_labels[, 5:25],
                                            use = "complete.obs"), 2))
#View(iq_train_labels.cor)

plot.new()
corrplot(as.matrix(iq_train_labels.cor),
         method = "number",
         type = "upper",
         title = "Corrplot for IQ",
         mar = c(0, 0, 1, 0))

rm(iq_train_labels.cor, sj_train_labels.cor)

Export correlation to CSV

# write.csv(df_train_labels.cor, file = "df_train_labels.cor.csv")
# write.csv(sj_train_labels.cor, file = "sj_train_labels.cor.csv")
# write.csv(iq_train_labels.cor, file = "iq_train_labels.cor.csv")

Comparison of correlations with the other non-na dataframes

Correlation with na.omit

library(corrplot)

# Repeat the correlation analysis on the row-deleted (na.omit) data to check
# whether the missing-value strategy changes the correlation structure.
sj_train_labels.cor.naomit <- data.frame(round(cor(sj_train_labels.naomit,
                                                   use = "complete.obs"), 2))
#View(sj_train_labels.cor.naomit)

corrplot(as.matrix(sj_train_labels.cor.naomit),
         type = "upper",
         method = "number")

iq_train_labels.cor.naomit <- data.frame(round(cor(iq_train_labels.naomit,
                                                   use = "complete.obs"), 2))
#View(iq_train_labels.cor.naomit)

corrplot(as.matrix(iq_train_labels.cor.naomit),
         type = "upper",
         method = "number")

rm(sj_train_labels.cor.naomit, iq_train_labels.cor.naomit)

Correlation with Last NA

library(corrplot)
# Correlation analysis on the LOCF-imputed (lastna) data for San Juan
sj_train_labels.cor.lastna <-data.frame(round(cor(sj_train_labels.lastna,
                                           use = "complete.obs"),2))
#View(sj_train_labels.cor.lastna)

corrplot(as.matrix(sj_train_labels.cor.lastna), 
         method = "number",
         type = "upper")

# Same for Iquitos
iq_train_labels.cor.lastna <-data.frame(round(cor(iq_train_labels.lastna,
                                           use = "complete.obs"),2))
#View(iq_train_labels.cor.lastna)

# NOTE(review): the IQ plot uses extra styling (hclust ordering, smaller
# labels, no diagonal) that the SJ plot above does not -- confirm whether the
# asymmetry is intentional or the SJ call should be styled to match.
corrplot(as.matrix(iq_train_labels.cor.lastna), 
         type = "upper", 
         tl.pos = "td",
         method = "number", 
         tl.cex = 0.75, 
         tl.col = 'black',
         order = "hclust",
         number.cex= 7/ncol(iq_train_labels.lastna),
         diag = FALSE)

rm(iq_train_labels.cor.lastna, sj_train_labels.cor.lastna)

Export non-na correlation to CSV

# write.csv(sj_train_labels.cor.naomit, file = "sj_train_labels.cor.naomit.csv")
# write.csv(iq_train_labels.cor.naomit, file = "iq_train_labels.cor.naomit.csv")
# 

# write.csv(sj_train_labels.cor.lastna, file = "sj_train_labels.cor.lastna.csv")
# write.csv(iq_train_labels.cor.lastna, file = "iq_train_labels.cor.lastna.csv")

Conclusion about correlation USE LASTNA

Different methods of handling missing values had no noticeable impact on the correlations, so we will stick with the lastna (LOCF-imputed) version as the final dataset.

Remove dataframes which will no longer be used

# The na.omit versions are no longer needed now that lastna is the choice
rm(sj_train_labels.naomit, iq_train_labels.naomit)

FEATURE SELECTION and DIMENSIONALITY REDUCTION

We will use various methods to see if we can find any features that need to be eliminated

Feature selection via CaretR (Remove redundant features)

CaretR (Remove redundant features) for SJ

library(mlbench)
## Warning: package 'mlbench' was built under R version 3.4.4
library(caret)
## Warning: package 'caret' was built under R version 3.4.4
## Loading required package: lattice

# Redundant-feature screen for San Juan: flag attributes whose pairwise
# correlation exceeds the 0.75 cutoff.
# calculate correlation matrix
CorrelationMatrix <- cor(sj_train_labels.lastna)
# find attributes that are highly correlated (ideally >0.75)
highlyCorrelated <- findCorrelation(CorrelationMatrix, cutoff=0.75)
# print indexes of highly correlated attributes
print(highlyCorrelated)
## [1] 16 10  8  9 18 11 12  7  5
# Map the indexes back to column names.  (The original `for (i in list(v))`
# loop ran exactly once with i bound to the whole vector; direct vector
# indexing expresses the same thing without the misleading loop.)
print(colnames(sj_train_labels.lastna)[highlyCorrelated])
## [1] "reanalysis_specific_humidity_g_per_kg"
## [2] "reanalysis_dew_point_temp_k"          
## [3] "reanalysis_air_temp_k"                
## [4] "reanalysis_avg_temp_k"                
## [5] "station_avg_temp_c"                   
## [6] "reanalysis_max_air_temp_k"            
## [7] "reanalysis_min_air_temp_k"            
## [8] "precipitation_amt_mm"                 
## [9] "ndvi_se"
rm(CorrelationMatrix, highlyCorrelated)

CaretR (Remove redundant features) for IQ

library(mlbench)
library(caret)

# Redundant-feature screen for Iquitos: flag attributes whose pairwise
# correlation exceeds the 0.75 cutoff.
# calculate correlation matrix
CorrelationMatrix <- cor(iq_train_labels.lastna)
# find attributes that are highly correlated (ideally >0.75)
highlyCorrelated <- findCorrelation(CorrelationMatrix, cutoff=0.75)
# print indexes of highly correlated attributes
print(highlyCorrelated)
## [1] 17 11 16 10  9  7  5  3  4
# Map the indexes back to column names (direct vector indexing replaces the
# original single-pass `for (i in list(v))` loop).
print(colnames(iq_train_labels.lastna)[highlyCorrelated])
## [1] "reanalysis_tdtr_k"                    
## [2] "reanalysis_max_air_temp_k"            
## [3] "reanalysis_specific_humidity_g_per_kg"
## [4] "reanalysis_dew_point_temp_k"          
## [5] "reanalysis_avg_temp_k"                
## [6] "precipitation_amt_mm"                 
## [7] "ndvi_se"                              
## [8] "ndvi_ne"                              
## [9] "ndvi_nw"
rm(CorrelationMatrix, highlyCorrelated)

Feature selection via CaretR (via RFE)

CaretR (via RFE) for SJ

library(mlbench)
library(caret)
# define the control using a random forest selection function
control <- rfeControl(functions=rfFuncs, method="cv", number=10)

# BUG FIX: the original passed columns 3:23 -- which still contain
# total_cases -- as the predictor matrix, so the target leaked into the
# feature set (the previously recorded output even ranked total_cases as the
# top "predictor").  Exclude the response before running RFE.
sj_rfe_predictors <- sj_train_labels.lastna[, 3:23]
sj_rfe_predictors$total_cases <- NULL
# run the RFE algorithm for SJ
sj_rfe_results <- rfe(sj_rfe_predictors,
                      sj_train_labels.lastna$total_cases,
                      sizes = 3:ncol(sj_rfe_predictors),
                      rfeControl = control)
# summarize the results
print(sj_rfe_results)
# list the chosen features
predictors(sj_rfe_results)
# plot the results
plot(sj_rfe_results, type=c("g", "o"), main = "RFE plot for SJ")

# (The knitted output previously recorded here came from the leaky run and
# has been removed as stale; re-knit to regenerate it.)
rm(sj_rfe_results, control, sj_rfe_predictors)

CaretR (via RFE) for IQ

library(mlbench)
library(caret)

# define the control using a random forest selection function
control <- rfeControl(functions=rfFuncs, method="cv", number=10)

# BUG FIX: the original passed the entire data frame -- including
# total_cases itself -- as the predictor set, so the target leaked into the
# features (the previously recorded output ranked total_cases as the top
# "predictor").  Exclude the response before running RFE.
iq_rfe_predictors <- iq_train_labels.lastna
iq_rfe_predictors$total_cases <- NULL
# run the RFE algorithm for IQ
iq_rfe_results <- rfe(iq_rfe_predictors,
                      iq_train_labels.lastna$total_cases,
                      sizes = 3:ncol(iq_rfe_predictors),
                      rfeControl = control)
# summarize the results
print(iq_rfe_results)
# list the chosen features
predictors(iq_rfe_results)
# plot the results
plot(iq_rfe_results, type=c("g", "o"), main = "RFE plot for IQ")

# (The knitted output previously recorded here came from the leaky run and
# has been removed as stale; re-knit to regenerate it.)
rm(iq_rfe_results, control, iq_rfe_predictors)

Feature selection using importance

Importance for SJ

# ensure results are repeatable
set.seed(136)

# load the library
library(mlbench)
library(caret)

# prepare training scheme
# NOTE(review): 11-fold CV with 1 repeat here, while the IQ run below uses
# number=10, repeats=3 -- confirm whether the asymmetry is intentional.
control <- trainControl(method="repeatedcv", number=11, repeats=1)

# train the model: conditional-inference random forest on columns 3:23.
# The formula's LHS keeps total_cases out of the predictor set here.
model <- train(total_cases~., data=sj_train_labels.lastna[,3:23], method="cforest", preProcess="scale", trControl=control)

# estimate variable importance (scale=FALSE keeps raw importance values)
importance <- varImp(model, scale=FALSE)
# summarize importance
print(importance)
## cforest variable importance
## 
##                                        Overall
## ndvi_se                               2707.068
## ndvi_sw                               1264.511
## reanalysis_specific_humidity_g_per_kg  524.414
## reanalysis_max_air_temp_k              232.114
## reanalysis_tdtr_k                       68.758
## station_avg_temp_c                      47.346
## station_max_temp_c                      38.592
## reanalysis_precip_amt_kg_per_m2         35.714
## ndvi_nw                                 31.800
## reanalysis_relative_humidity_percent    26.746
## station_min_temp_c                      25.271
## ndvi_ne                                 22.196
## reanalysis_dew_point_temp_k             17.165
## station_diur_temp_rng_c                 14.770
## reanalysis_min_air_temp_k               11.496
## precipitation_amt_mm                    10.432
## station_precip_mm                        6.900
## reanalysis_air_temp_k                    4.880
## reanalysis_avg_temp_k                    1.967
## reanalysis_sat_precip_amt_mm             0.000
# plot importance
plot(importance)

# Inspect the fitted model (recorded run: conditional-inference random
# forest, 500 trees, 936 observations)
model$finalModel
# BUG FIX: the original rm() also listed GCtorture, which is not an object
# in this session (gctorture is a base function), and produced a spurious
# "object not found" warning -- the same warning recorded for the IQ chunk.
rm(model, importance, control)

Importance for IQ

# ensure results are repeatable
set.seed(136)

# load the library
library(mlbench)
library(caret)

# prepare training scheme
# NOTE(review): 10-fold CV repeated 3 times here vs number=11, repeats=1 for
# SJ above -- confirm whether the asymmetry is intentional.
control <- trainControl(method="repeatedcv", number=10, repeats=3)

# train the model: conditional-inference random forest on columns 3:23.
# The formula's LHS keeps total_cases out of the predictor set here.
model <- train(total_cases~., data=iq_train_labels.lastna[,3:23], method="cforest", preProcess="scale", trControl=control)

# estimate variable importance (scale=FALSE keeps raw importance values)
importance <- varImp(model, scale=FALSE)
# summarize importance
print(importance)
## cforest variable importance
## 
##                                         Overall
## reanalysis_specific_humidity_g_per_kg  3.318451
## reanalysis_dew_point_temp_k            2.940082
## reanalysis_min_air_temp_k              1.674939
## reanalysis_tdtr_k                      1.378423
## reanalysis_precip_amt_kg_per_m2        1.285543
## station_min_temp_c                     1.212590
## reanalysis_relative_humidity_percent   1.087495
## reanalysis_avg_temp_k                  1.078973
## reanalysis_air_temp_k                  0.908769
## station_avg_temp_c                     0.842579
## station_max_temp_c                     0.559030
## station_diur_temp_rng_c                0.129754
## reanalysis_sat_precip_amt_mm           0.115857
## station_precip_mm                      0.047493
## ndvi_nw                                0.012152
## ndvi_sw                                0.007328
## ndvi_se                               -0.037845
## reanalysis_max_air_temp_k             -0.095271
## precipitation_amt_mm                  -0.168698
## ndvi_ne                               -0.267532
# plot importance
plot(importance)

# BUG FIX: dropped GCtorture from rm() -- no such object exists in this
# session, which is exactly what the previously recorded warning
# ("object 'GCtorture' not found") complained about.
rm(model, importance, control)

Feature selection via Boruta

Boruta for SJ

library(Boruta)
## Warning: package 'Boruta' was built under R version 3.4.4
## Loading required package: ranger
## Warning: package 'ranger' was built under R version 3.4.4
# BUG FIX: the original formula was sj_train_labels.lastna$total_cases ~ .,
# which leaves total_cases inside the "." predictor set -- the response
# leaks into the features and is then "confirmed important".  Naming the
# response from `data` excludes it from the RHS dot.
sj_train_labels.boruta <- Boruta(total_cases ~ ., data = sj_train_labels.lastna, doTrace = 2)
##  1. run of importance source...
##  2. run of importance source...
##  3. run of importance source...
##  4. run of importance source...
##  5. run of importance source...
##  6. run of importance source...
##  7. run of importance source...
##  8. run of importance source...
##  9. run of importance source...
##  10. run of importance source...
##  11. run of importance source...
##  12. run of importance source...
## After 12 iterations, +29 secs:
##  confirmed 14 attributes: ndvi_ne, ndvi_nw, ndvi_se, ndvi_sw, reanalysis_avg_temp_k and 9 more;
##  still have 8 attributes left.
##  13. run of importance source...
##  14. run of importance source...
##  15. run of importance source...
##  16. run of importance source...
## After 16 iterations, +38 secs:
##  confirmed 2 attributes: reanalysis_air_temp_k, station_min_temp_c;
##  still have 6 attributes left.
##  17. run of importance source...
##  18. run of importance source...
##  19. run of importance source...
## After 19 iterations, +45 secs:
##  confirmed 1 attribute: station_max_temp_c;
##  still have 5 attributes left.
##  20. run of importance source...
##  21. run of importance source...
##  22. run of importance source...
##  23. run of importance source...
##  24. run of importance source...
##  25. run of importance source...
##  26. run of importance source...
##  27. run of importance source...
##  28. run of importance source...
##  29. run of importance source...
##  30. run of importance source...
##  31. run of importance source...
##  32. run of importance source...
##  33. run of importance source...
##  34. run of importance source...
##  35. run of importance source...
##  36. run of importance source...
##  37. run of importance source...
##  38. run of importance source...
##  39. run of importance source...
##  40. run of importance source...
##  41. run of importance source...
##  42. run of importance source...
##  43. run of importance source...
##  44. run of importance source...
##  45. run of importance source...
##  46. run of importance source...
##  47. run of importance source...
##  48. run of importance source...
##  49. run of importance source...
##  50. run of importance source...
##  51. run of importance source...
##  52. run of importance source...
##  53. run of importance source...
##  54. run of importance source...
##  55. run of importance source...
##  56. run of importance source...
## After 56 iterations, +2.1 mins:
##  confirmed 1 attribute: station_precip_mm;
##  still have 4 attributes left.
##  57. run of importance source...
##  58. run of importance source...
##  59. run of importance source...
##  60. run of importance source...
##  61. run of importance source...
##  62. run of importance source...
##  63. run of importance source...
##  64. run of importance source...
##  65. run of importance source...
##  66. run of importance source...
##  67. run of importance source...
##  68. run of importance source...
##  69. run of importance source...
##  70. run of importance source...
##  71. run of importance source...
##  72. run of importance source...
##  73. run of importance source...
##  74. run of importance source...
##  75. run of importance source...
##  76. run of importance source...
##  77. run of importance source...
##  78. run of importance source...
## After 78 iterations, +3 mins:
##  confirmed 1 attribute: reanalysis_sat_precip_amt_mm;
##  still have 3 attributes left.
##  79. run of importance source...
##  80. run of importance source...
##  81. run of importance source...
##  82. run of importance source...
##  83. run of importance source...
##  84. run of importance source...
##  85. run of importance source...
##  86. run of importance source...
##  87. run of importance source...
##  88. run of importance source...
## After 88 iterations, +3.3 mins:
##  confirmed 1 attribute: precipitation_amt_mm;
##  still have 2 attributes left.
##  89. run of importance source...
##  90. run of importance source...
##  91. run of importance source...
## After 91 iterations, +3.4 mins:
##  confirmed 1 attribute: station_diur_temp_rng_c;
##  still have 1 attribute left.
##  92. run of importance source...
##  93. run of importance source...
##  94. run of importance source...
##  95. run of importance source...
##  96. run of importance source...
##  97. run of importance source...
##  98. run of importance source...
##  99. run of importance source...
# Boruta results for San Juan.
# NOTE(review): this recorded run used the formula
# sj_train_labels.lastna$total_cases ~ ., which keeps total_cases in the
# predictor set, so the "confirmed important" counts below include the
# response itself -- treat these numbers with caution.
print(sj_train_labels.boruta)
## Boruta performed 99 iterations in 3.745824 mins.
##  21 attributes confirmed important: ndvi_ne, ndvi_nw, ndvi_se,
## ndvi_sw, precipitation_amt_mm and 16 more;
##  No attributes deemed unimportant.
##  1 tentative attributes left: reanalysis_tdtr_k;
# Resolve remaining tentative attributes with Boruta's rough fix
sj_train_labels.boruta  <- TentativeRoughFix(sj_train_labels.boruta)
print(sj_train_labels.boruta)
## Boruta performed 99 iterations in 3.745824 mins.
## Tentatives roughfixed over the last 99 iterations.
##  22 attributes confirmed important: ndvi_ne, ndvi_nw, ndvi_se,
## ndvi_sw, precipitation_amt_mm and 17 more;
##  No attributes deemed unimportant.
# Boruta importance plot for SJ: suppress default x labels, then add
# attribute names along the x-axis sorted by median importance (presumably
# matching plot.Boruta's box ordering -- verify against the rendered plot)
plot(sj_train_labels.boruta, xlab = "", xaxt = "n")
lz<-lapply(1:ncol(sj_train_labels.boruta$ImpHistory),function(i)
sj_train_labels.boruta$ImpHistory[is.finite(sj_train_labels.boruta$ImpHistory[,i]),i])
names(lz) <- colnames(sj_train_labels.boruta$ImpHistory)
Labels <- sort(sapply(lz,median))
axis(side = 1,las=2,labels = names(Labels),
at = 1:ncol(sj_train_labels.boruta$ImpHistory), cex.axis = 0.7)

rm(lz, Labels, sj_train_labels.boruta)

Boruta for IQ

library(Boruta)
# BUG FIX: as with the San Juan run, use total_cases ~ . so that the
# response is excluded from the "." predictor set instead of leaking into
# the feature importance ranking.
iq_train_labels.boruta <- Boruta(total_cases ~ ., data = iq_train_labels.lastna, doTrace = 2)
##  1. run of importance source...
##  2. run of importance source...
##  3. run of importance source...
##  4. run of importance source...
##  5. run of importance source...
##  6. run of importance source...
##  7. run of importance source...
##  8. run of importance source...
##  9. run of importance source...
##  10. run of importance source...
##  11. run of importance source...
##  12. run of importance source...
## After 12 iterations, +15 secs:
##  confirmed 7 attributes: reanalysis_dew_point_temp_k, reanalysis_min_air_temp_k, reanalysis_precip_amt_kg_per_m2, reanalysis_specific_humidity_g_per_kg, station_avg_temp_c and 2 more;
##  still have 15 attributes left.
##  13. run of importance source...
##  14. run of importance source...
##  15. run of importance source...
##  16. run of importance source...
## After 16 iterations, +19 secs:
##  confirmed 3 attributes: reanalysis_air_temp_k, reanalysis_relative_humidity_percent, station_max_temp_c;
##  rejected 2 attributes: ndvi_nw, ndvi_sw;
##  still have 10 attributes left.
##  17. run of importance source...
##  18. run of importance source...
##  19. run of importance source...
## After 19 iterations, +22 secs:
##  confirmed 1 attribute: reanalysis_avg_temp_k;
##  still have 9 attributes left.
##  20. run of importance source...
##  21. run of importance source...
##  22. run of importance source...
## After 22 iterations, +25 secs:
##  rejected 1 attribute: station_precip_mm;
##  still have 8 attributes left.
##  23. run of importance source...
##  24. run of importance source...
##  25. run of importance source...
##  26. run of importance source...
## After 26 iterations, +29 secs:
##  rejected 1 attribute: ndvi_ne;
##  still have 7 attributes left.
##  27. run of importance source...
##  28. run of importance source...
##  29. run of importance source...
## After 29 iterations, +32 secs:
##  confirmed 1 attribute: reanalysis_tdtr_k;
##  still have 6 attributes left.
##  30. run of importance source...
##  31. run of importance source...
##  32. run of importance source...
##  33. run of importance source...
##  34. run of importance source...
##  35. run of importance source...
##  36. run of importance source...
##  37. run of importance source...
##  38. run of importance source...
##  39. run of importance source...
##  40. run of importance source...
##  41. run of importance source...
##  42. run of importance source...
##  43. run of importance source...
##  44. run of importance source...
##  45. run of importance source...
##  46. run of importance source...
##  47. run of importance source...
##  48. run of importance source...
##  49. run of importance source...
##  50. run of importance source...
##  51. run of importance source...
##  52. run of importance source...
##  53. run of importance source...
##  54. run of importance source...
##  55. run of importance source...
##  56. run of importance source...
##  57. run of importance source...
##  58. run of importance source...
##  59. run of importance source...
##  60. run of importance source...
##  61. run of importance source...
##  62. run of importance source...
##  63. run of importance source...
##  64. run of importance source...
##  65. run of importance source...
##  66. run of importance source...
##  67. run of importance source...
##  68. run of importance source...
##  69. run of importance source...
##  70. run of importance source...
##  71. run of importance source...
##  72. run of importance source...
##  73. run of importance source...
##  74. run of importance source...
##  75. run of importance source...
##  76. run of importance source...
##  77. run of importance source...
##  78. run of importance source...
##  79. run of importance source...
##  80. run of importance source...
##  81. run of importance source...
## After 81 iterations, +1.4 mins:
##  rejected 1 attribute: ndvi_se;
##  still have 5 attributes left.
##  82. run of importance source...
##  83. run of importance source...
##  84. run of importance source...
##  85. run of importance source...
##  86. run of importance source...
##  87. run of importance source...
##  88. run of importance source...
##  89. run of importance source...
##  90. run of importance source...
##  91. run of importance source...
##  92. run of importance source...
##  93. run of importance source...
##  94. run of importance source...
##  95. run of importance source...
##  96. run of importance source...
##  97. run of importance source...
##  98. run of importance source...
##  99. run of importance source...
# Show Boruta's verdict for IQ after the initial 99 iterations:
# 12 confirmed, 5 rejected, 5 still tentative.
print(iq_train_labels.boruta)
## Boruta performed 99 iterations in 1.640086 mins.
##  12 attributes confirmed important: reanalysis_air_temp_k,
## reanalysis_avg_temp_k, reanalysis_dew_point_temp_k,
## reanalysis_min_air_temp_k, reanalysis_precip_amt_kg_per_m2 and 7
## more;
##  5 attributes confirmed unimportant: ndvi_ne, ndvi_nw, ndvi_se,
## ndvi_sw, station_precip_mm;
##  5 tentative attributes left: precipitation_amt_mm,
## reanalysis_max_air_temp_k, reanalysis_sat_precip_amt_mm,
## station_diur_temp_rng_c, station_min_temp_c;
#Fix and tentative attributes
# TentativeRoughFix resolves each tentative attribute by comparing its
# median importance against the best shadow attribute, so every
# attribute ends up either confirmed or rejected.
iq_train_labels.boruta  <- TentativeRoughFix(iq_train_labels.boruta)
print(iq_train_labels.boruta)
## Boruta performed 99 iterations in 1.640086 mins.
## Tentatives roughfixed over the last 99 iterations.
##  14 attributes confirmed important: precipitation_amt_mm,
## reanalysis_air_temp_k, reanalysis_avg_temp_k,
## reanalysis_dew_point_temp_k, reanalysis_min_air_temp_k and 9 more;
##  8 attributes confirmed unimportant: ndvi_ne, ndvi_nw, ndvi_se,
## ndvi_sw, reanalysis_max_air_temp_k and 3 more;
#Boruta plot for IQ

# Importance boxplot; suppress the default x-axis so we can draw
# readable, rotated labels ourselves below.
plot(iq_train_labels.boruta, xlab = "", xaxt = "n")

# Keep only the finite importance values recorded for each attribute
# across the Boruta runs (seq_len() is safer than 1:ncol()).
lz <- lapply(seq_len(ncol(iq_train_labels.boruta$ImpHistory)), function(i)
  iq_train_labels.boruta$ImpHistory[is.finite(iq_train_labels.boruta$ImpHistory[, i]), i])
names(lz) <- colnames(iq_train_labels.boruta$ImpHistory)

# Label positions ordered by median importance, matching the plotted
# boxes; vapply() guarantees a numeric result, unlike sapply().
Labels <- sort(vapply(lz, median, numeric(1)))
axis(side = 1, las = 2, labels = names(Labels),
     at = seq_len(ncol(iq_train_labels.boruta$ImpHistory)), cex.axis = 0.7)

rm(lz, Labels, iq_train_labels.boruta)

Feature Selection using Random Forest

Random Forest for SJ

library(reshape2)
## Warning: package 'reshape2' was built under R version 3.4.4
library(ggplot2)
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.4.4
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ranger':
## 
##     importance
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:psych':
## 
##     outlier
library(caret)

#Fit a model
 
# Fit a random forest for SJ on all 20 climate/vegetation predictors.
# Using a bare `total_cases ~ ...` formula (instead of
# `sj_train_labels.startweek$total_cases ~ ...`) lets the response be
# resolved from `data`, which is the reliable form for later predict()
# calls; the fitted model is otherwise identical.
model_sj.rf <- randomForest(
  total_cases ~
    ndvi_ne +
    ndvi_nw +
    ndvi_se +
    ndvi_sw +
    precipitation_amt_mm +
    reanalysis_air_temp_k +
    reanalysis_avg_temp_k +
    reanalysis_dew_point_temp_k +
    reanalysis_max_air_temp_k +
    reanalysis_min_air_temp_k +
    reanalysis_precip_amt_kg_per_m2 +
    reanalysis_relative_humidity_percent +
    reanalysis_sat_precip_amt_mm +
    reanalysis_specific_humidity_g_per_kg +
    reanalysis_tdtr_k +
    station_avg_temp_c +
    station_diur_temp_rng_c +
    station_max_temp_c +
    station_min_temp_c +
    station_precip_mm,
  data = sj_train_labels.startweek,
  importance = TRUE,   # record permutation importance (%IncMSE)
  ntree = 1000)
 
#How many trees are needed to reach the minimum error estimate? 
which.min(model_sj.rf$mse)
## [1] 665
plot(model_sj.rf) 

#Find the importance of the RF model
# Column 1 of importance() is %IncMSE (permutation importance).
# optional = TRUE (never the reassignable shorthand T) keeps the
# variable names as row names without name mangling.
imp <- as.data.frame(sort(importance(model_sj.rf)[, 1], decreasing = TRUE),
                     optional = TRUE)
names(imp) <- "% Inc MSE"
imp
##                                       % Inc MSE
## ndvi_se                               37.701045
## ndvi_sw                               23.312328
## reanalysis_dew_point_temp_k           13.287574
## reanalysis_specific_humidity_g_per_kg 12.960603
## ndvi_nw                               12.753006
## ndvi_ne                               12.341615
## station_precip_mm                     12.207094
## reanalysis_min_air_temp_k             12.109047
## reanalysis_relative_humidity_percent  10.933707
## reanalysis_precip_amt_kg_per_m2       10.373380
## reanalysis_tdtr_k                      8.822743
## reanalysis_max_air_temp_k              8.705044
## station_avg_temp_c                     7.945887
## reanalysis_air_temp_k                  7.590120
## reanalysis_sat_precip_amt_mm           7.526884
## station_min_temp_c                     7.114981
## reanalysis_avg_temp_k                  6.957742
## station_max_temp_c                     5.571364
## precipitation_amt_mm                   5.289672
## station_diur_temp_rng_c                4.998066
#graph the importance
# type = 1: permutation importance (%IncMSE)
varImpPlot(model_sj.rf, type = 1)

# type = 2: node-purity importance (IncNodePurity)
varImpPlot(model_sj.rf, type = 2)

# Free the SJ model objects before moving on to IQ
rm(model_sj.rf, imp)

Random Forest for IQ

library(reshape2)
library(ggplot2)
library(randomForest)
library(caret)

#Fit a model
 
# Fit a random forest for IQ on all 20 climate/vegetation predictors.
# Using a bare `total_cases ~ ...` formula (instead of
# `iq_train_labels.startweek$total_cases ~ ...`) lets the response be
# resolved from `data`, which is the reliable form for later predict()
# calls; the fitted model is otherwise identical.
model_iq.rf <- randomForest(
  total_cases ~
    ndvi_ne +
    ndvi_nw +
    ndvi_se +
    ndvi_sw +
    precipitation_amt_mm +
    reanalysis_air_temp_k +
    reanalysis_avg_temp_k +
    reanalysis_dew_point_temp_k +
    reanalysis_max_air_temp_k +
    reanalysis_min_air_temp_k +
    reanalysis_precip_amt_kg_per_m2 +
    reanalysis_relative_humidity_percent +
    reanalysis_sat_precip_amt_mm +
    reanalysis_specific_humidity_g_per_kg +
    reanalysis_tdtr_k +
    station_avg_temp_c +
    station_diur_temp_rng_c +
    station_max_temp_c +
    station_min_temp_c +
    station_precip_mm,
  data = iq_train_labels.startweek,
  importance = TRUE,   # record permutation importance (%IncMSE)
  ntree = 1000)
 
#How many trees are needed to reach the minimum error estimate? 
which.min(model_iq.rf$mse)
## [1] 880
plot(model_iq.rf) 

#Find the importance of the RF model
# Column 1 of importance() is %IncMSE (permutation importance).
# optional = TRUE (never the reassignable shorthand T) keeps the
# variable names as row names without name mangling.
imp <- as.data.frame(sort(importance(model_iq.rf)[, 1], decreasing = TRUE),
                     optional = TRUE)
names(imp) <- "% Inc MSE"
imp
##                                       % Inc MSE
## reanalysis_specific_humidity_g_per_kg 14.598346
## station_avg_temp_c                    13.816486
## reanalysis_precip_amt_kg_per_m2       12.857407
## reanalysis_dew_point_temp_k           12.609141
## station_max_temp_c                    12.320637
## reanalysis_relative_humidity_percent  10.737072
## reanalysis_tdtr_k                      9.481949
## reanalysis_min_air_temp_k              8.021002
## station_diur_temp_rng_c                7.831725
## reanalysis_max_air_temp_k              7.098665
## reanalysis_air_temp_k                  7.072691
## reanalysis_avg_temp_k                  6.403049
## ndvi_ne                                5.636981
## ndvi_sw                                5.358412
## reanalysis_sat_precip_amt_mm           4.912512
## ndvi_se                                4.140040
## precipitation_amt_mm                   4.040661
## station_precip_mm                      2.886239
## station_min_temp_c                     2.712698
## ndvi_nw                                1.982185
#graph the importance
# type = 1: permutation importance (%IncMSE)
varImpPlot(model_iq.rf, type = 1)

# type = 2: node-purity importance (IncNodePurity)
varImpPlot(model_iq.rf, type = 2)

# Free the IQ model objects before the peak analysis
rm(model_iq.rf, imp)

PEAK MODEL ANALYSIS

Set up the peak analysis

Determine the highest total_cases per week in a year

#calculate the max value by year and sort by highest cases
max_cases.year <- sort(
  tapply(sj_train_labels.lastna$total_cases, sj_train_labels.lastna$year, max),
  decreasing = TRUE)

max_cases.year
## 1994 1998 2007 1991 1995 2005 1997 1992 1999 2001 1990 1993 2003 2000 2002 
##  461  329  170  169  154  137  112  104   77   75   71   46   41   38   38 
## 1996 2006 2004 2008 
##   35   33   27   15
#Determine which weeks are associated to which max year values
# names() recovers the year labels and works for both named vectors and
# 1-d arrays, unlike dimnames()[[1]].
dname <- names(max_cases.year)

# Inspect the 6 years with the largest weekly peaks; for each, rank the
# weeks of that year by case count.
for (i in 1:6) {
  in_year <- sj_train_labels.lastna$year == as.numeric(dname[i])
  max_cases.week <- sort(
    tapply(sj_train_labels.lastna$total_cases[in_year],
           sj_train_labels.lastna$weekofyear[in_year],
           max),
    decreasing = TRUE)

  print(dname[i])
  print(max_cases.week[1:15])
}
## [1] "1994"
##  41  40  45  39  42  46  47  44  43  38  48  37  49  36  35 
## 461 426 410 395 381 364 359 353 333 302 288 272 221 202 179 
## [1] "1998"
##  32  33  31  34  35  30  36  29  27  28  41  26  40  37  51 
## 329 263 256 220 204 191 181 150 128 127 127 102 102  99  89 
## [1] "2007"
##  40  41  37  38  42  39  35  34  33  32  36  43  44  45  30 
## 170 135 112 106 106 101  92  76  75  72  71  68  48  48  42 
## [1] "1991"
##  48  42  49  40  44  45  43  47  46  41  50  39  38  51  34 
## 169 142 141 140 140 140 129 129 127 116 108  92  89  78  76 
## [1] "1995"
##  52   1   2   3  43   4  36  44   5  42  46  48  37  39  38 
## 154  91  72  56  48  46  40  40  37  36  36  34  33  33  31 
## [1] "2005"
##  35  36  33  34  37  31  32  38  30  39  29  41  42  43  44 
## 137 131 126 119 112  83  82  82  75  73  56  55  55  53  46
rm(dname, i, in_year, max_cases.week, max_cases.year)

Create dataframe with peaks only

Five peaks will be isolated for each city, taking 5 weeks on each side of the yearly maximum value. A new dataframe will be made for use in mutual information analysis and then for prediction.

Add a binary variable “peak” for logistic regression purposes (peak = 1)

# Helper: rows of `df` for one year restricted to a week-of-year window
# (inclusive on both ends).
peak_window <- function(df, yr, wk_from, wk_to) {
  df[df$year == yr & df$weekofyear >= wk_from & df$weekofyear <= wk_to, ]
}

# Five isolated peaks, each the yearly maximum week +/- 5 weeks.
# Row order matches the original construction: 1991 (which spills into
# week 1 of 1992), then 1994, 1998, 2005, 2007.
sj.peak <- rbind(
  peak_window(sj_train_labels.lastna, 1991, 43, 52),
  peak_window(sj_train_labels.lastna, 1992,  1,  1),
  peak_window(sj_train_labels.lastna, 1994, 36, 46),
  peak_window(sj_train_labels.lastna, 1998, 37, 47),
  peak_window(sj_train_labels.lastna, 2005, 30, 40),
  peak_window(sj_train_labels.lastna, 2007, 35, 45)
)

# Binary indicator for logistic regression: these rows are peaks
sj.peak$peak <- 1

rm(peak_window)

Create dataframe with non-peaks

Add a binary variable “nonpeak” for logistic regression purposes (peak = 0)

# Non-peak reference windows: weeks 12-22 of five quiet years.
# The year order matches the original rbind order.
nonpeak_years <- c(1997, 2001, 2003, 1993, 1996)

sj.nonpeak <- do.call(rbind, lapply(nonpeak_years, function(yr) {
  sj_train_labels.lastna[sj_train_labels.lastna$year == yr &
                           sj_train_labels.lastna$weekofyear >= 12 &
                           sj_train_labels.lastna$weekofyear <= 22, ]
}))

rm(nonpeak_years)

# Binary indicator for logistic regression: these rows are non-peaks
sj.nonpeak$peak <- 0

Feature selection using logistic regression

Build and test a logistic regression model for the peaks

sj_peak.glm <- rbind(sj.peak, sj.nonpeak)

# Fit a logistic regression model.
# Use `peak ~ .` rather than `sj_peak.glm$peak ~ .`: with the `$` form
# the symbol `peak` never appears in the formula, so the `.` expansion
# includes the peak column itself as a predictor (the model regresses
# the response on itself). The bare-name form excludes it.
# NOTE(review): `.` still includes total_cases, the very variable the
# peak windows were defined from, so near-perfect separation (and the
# non-convergence warnings below) is expected -- confirm whether
# total_cases should be dropped from the predictors.
fit_glm <- glm(peak ~ .,
               data = sj_peak.glm,
               family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# generate summary
 
summary(fit_glm)
## 
## Call:
## glm(formula = sj_peak.glm$peak ~ ., family = "binomial", data = sj_peak.glm)
## 
## Deviance Residuals: 
##        Min          1Q      Median          3Q         Max  
## -1.661e-05  -2.110e-08   0.000e+00   2.110e-08   1.585e-05  
## 
## Coefficients: (1 not defined because of singularities)
##                                         Estimate Std. Error z value
## (Intercept)                            8.124e+03  7.892e+07       0
## year                                  -1.285e+00  2.564e+04       0
## weekofyear                             5.992e-01  1.313e+04       0
## ndvi_ne                                5.914e+01  1.025e+06       0
## ndvi_nw                               -3.671e+01  1.408e+06       0
## ndvi_se                                2.318e+02  3.178e+06       0
## ndvi_sw                               -2.042e+02  3.702e+06       0
## precipitation_amt_mm                  -1.663e-01  1.923e+03       0
## reanalysis_air_temp_k                 -1.604e+02  2.583e+06       0
## reanalysis_avg_temp_k                 -4.632e+01  9.555e+05       0
## reanalysis_dew_point_temp_k            1.302e+02  2.787e+06       0
## reanalysis_max_air_temp_k             -1.354e+01  1.593e+05       0
## reanalysis_min_air_temp_k             -4.127e+00  2.013e+05       0
## reanalysis_precip_amt_kg_per_m2        2.574e-01  2.709e+03       0
## reanalysis_relative_humidity_percent  -5.288e+01  5.464e+05       0
## reanalysis_sat_precip_amt_mm                  NA         NA      NA
## reanalysis_specific_humidity_g_per_kg  1.277e+02  1.006e+06       0
## reanalysis_tdtr_k                      4.768e+00  1.680e+05       0
## station_avg_temp_c                    -1.567e+01  3.547e+05       0
## station_diur_temp_rng_c               -1.341e+01  1.116e+05       0
## station_max_temp_c                     8.618e+00  1.346e+05       0
## station_min_temp_c                    -1.397e+01  1.565e+05       0
## station_precip_mm                     -1.428e-01  2.519e+03       0
## total_cases                            2.963e-01  1.900e+03       0
##                                       Pr(>|z|)
## (Intercept)                                  1
## year                                         1
## weekofyear                                   1
## ndvi_ne                                      1
## ndvi_nw                                      1
## ndvi_se                                      1
## ndvi_sw                                      1
## precipitation_amt_mm                         1
## reanalysis_air_temp_k                        1
## reanalysis_avg_temp_k                        1
## reanalysis_dew_point_temp_k                  1
## reanalysis_max_air_temp_k                    1
## reanalysis_min_air_temp_k                    1
## reanalysis_precip_amt_kg_per_m2              1
## reanalysis_relative_humidity_percent         1
## reanalysis_sat_precip_amt_mm                NA
## reanalysis_specific_humidity_g_per_kg        1
## reanalysis_tdtr_k                            1
## station_avg_temp_c                           1
## station_diur_temp_rng_c                      1
## station_max_temp_c                           1
## station_min_temp_c                           1
## station_precip_mm                            1
## total_cases                                  1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1.5249e+02  on 109  degrees of freedom
## Residual deviance: 1.7801e-09  on  87  degrees of freedom
## AIC: 46
## 
## Number of Fisher Scoring iterations: 25
rm(sj_peak.glm, fit_glm)

Feature selection using Mutual Information

This analysis looks at the mutual information for a peak prediction model. Five peaks will be used

Determine mutual info for the new dataframe with peaks

library(entropy)
library(infotheo)
## 
## Attaching package: 'infotheo'
## The following objects are masked from 'package:entropy':
## 
##     discretize, entropy
mu <- data.frame()

cnames <- colnames(sj.peak)

# The discretization of total_cases does not depend on the loop index,
# so compute it once instead of once per predictor.
disc1 <- discretize(sj.peak$total_cases)

# Columns 5 .. (ncol - 2): the candidate predictors (skips the leading
# id columns and the trailing total_cases/peak columns).
for (i in 5:(ncol(sj.peak) - 2)) {
  disc2 <- discretize(sj.peak[, i])
  mu[i - 4, 1] <- cnames[i]
  mu[i - 4, 2] <- mutinformation(disc1, disc2)
}

# Rank predictors by mutual information with total_cases
mu[order(mu$V2, decreasing = TRUE), ]
##                                       V1         V2
## 1                                ndvi_se 0.22008822
## 15               station_diur_temp_rng_c 0.10178915
## 6            reanalysis_dew_point_temp_k 0.08947514
## 12 reanalysis_specific_humidity_g_per_kg 0.07098685
## 7              reanalysis_max_air_temp_k 0.06446396
## 18                     station_precip_mm 0.05977442
## 8              reanalysis_min_air_temp_k 0.05513424
## 4                  reanalysis_air_temp_k 0.05412362
## 17                    station_min_temp_c 0.05227911
## 5                  reanalysis_avg_temp_k 0.04437999
## 13                     reanalysis_tdtr_k 0.04282410
## 10  reanalysis_relative_humidity_percent 0.03870491
## 9        reanalysis_precip_amt_kg_per_m2 0.03184109
## 16                    station_max_temp_c 0.03036126
## 2                                ndvi_sw 0.02205445
## 14                    station_avg_temp_c 0.02192181
## 3                   precipitation_amt_mm 0.01964312
## 11          reanalysis_sat_precip_amt_mm 0.01964312
rm(mu, cnames, i, disc1, disc2)

Determine mutual info for the new dataframe with nonpeaks

library(entropy)
library(infotheo)

mu <- data.frame()

cnames <- colnames(sj.nonpeak)

# The discretization of total_cases does not depend on the loop index,
# so compute it once instead of once per predictor.
disc1 <- discretize(sj.nonpeak$total_cases)

# Columns 5 .. (ncol - 2): the candidate predictors (skips the leading
# id columns and the trailing total_cases/peak columns).
for (i in 5:(ncol(sj.nonpeak) - 2)) {
  disc2 <- discretize(sj.nonpeak[, i])
  mu[i - 4, 1] <- cnames[i]
  mu[i - 4, 2] <- mutinformation(disc1, disc2)
}

# Rank predictors by mutual information with total_cases
mu[order(mu$V2, decreasing = TRUE), ]
##                                       V1         V2
## 1                                ndvi_se 0.09648317
## 5                  reanalysis_avg_temp_k 0.06407748
## 4                  reanalysis_air_temp_k 0.06179880
## 2                                ndvi_sw 0.05977442
## 9        reanalysis_precip_amt_kg_per_m2 0.04481389
## 8              reanalysis_min_air_temp_k 0.04417511
## 10  reanalysis_relative_humidity_percent 0.04358990
## 18                     station_precip_mm 0.03305411
## 17                    station_min_temp_c 0.02549627
## 14                    station_avg_temp_c 0.02374231
## 13                     reanalysis_tdtr_k 0.02148998
## 7              reanalysis_max_air_temp_k 0.01964312
## 15               station_diur_temp_rng_c 0.01804743
## 3                   precipitation_amt_mm 0.01703681
## 11          reanalysis_sat_precip_amt_mm 0.01703681
## 16                    station_max_temp_c 0.01473814
## 12 reanalysis_specific_humidity_g_per_kg 0.01138601
## 6            reanalysis_dew_point_temp_k 0.01076950
rm(mu, cnames, i, disc1, disc2)

Feature selection using Naive Bayes

Build and test a Naive Bayes model for the peaks

library(e1071)
## Warning: package 'e1071' was built under R version 3.4.4
sj_peak.nb <- rbind(sj.peak, sj.nonpeak)

# Fit a Naive Bayes model.
# Use `total_cases ~ .` rather than `sj_peak.nb$total_cases ~ .`: with
# the `$` form the `.` expansion includes total_cases itself as a
# predictor, so the model "predicts" the response from the response.
# NOTE(review): this section is titled "a Naive Bayes model for the
# peaks", so the intended response may actually be `peak` rather than
# total_cases -- confirm.
fit_nb <- naiveBayes(total_cases ~ ., data = sj_peak.nb)
 
# generate summary

summary(fit_nb)
##         Length Class  Mode   
## apriori 62     table  numeric
## tables  23     -none- list   
## levels   0     -none- NULL   
## call     4     -none- call
#remove fit_nb
rm(fit_nb, sj_peak.nb)

Remove peak and non-peak dataframes

rm(sj.peak, sj.nonpeak)

PREDICTIVE MODELS

The following predictive models will review the Root Mean Square Error (RMSE) for each city. This is done without feature selection and with missing values imputed as the last non-NA values.

80/20 training and validation split

SJ: training and validation

set.seed(136) 
 
# randomly pick 80% of the number of observations
# (seq_len() draws the same sample as 1:n here but is safe for
# zero-row inputs)
index.sj <- sample(seq_len(nrow(sj_train_labels.startweek)),
                   size = 0.8 * nrow(sj_train_labels.startweek))
 
# subset train_labels to include only the elements in the index
train.sj <- sj_train_labels.startweek[index.sj, ]
 
# subset train_labels to include all but the elements in the index
validation.sj <- sj_train_labels.startweek[-index.sj, ]
 
nrow(train.sj)
## [1] 748
nrow(validation.sj)
## [1] 188
# # Create a dataframe with train and test indicator...
# group <- rep(NA,nrow(sj_train_labels.startweek))
# 
# group <- ifelse(seq(1,nrow(sj_train_labels.startweek)) %in% index,"Train","Validation")
# 
# df <- data.frame(date=sj_train_labels.startweek$week_start_date,cases=sj_train_labels.startweek$total_cases,group)
# 
# # ...and plot it
# ggplot(df,aes(x = date,y = cases, color = group)) + geom_point() +
#   scale_color_discrete(name="") + theme(legend.position="top")

rm(index.sj)

IQ: training and validation

set.seed(136) 
 
# randomly pick 80% of the number of observations
# (seq_len() draws the same sample as 1:n here but is safe for
# zero-row inputs)
index.iq <- sample(seq_len(nrow(iq_train_labels.startweek)),
                   size = 0.8 * nrow(iq_train_labels.startweek))
 
# subset train_labels to include only the elements in the index
train.iq <- iq_train_labels.startweek[index.iq, ]
 
# subset train_labels to include all but the elements in the index
validation.iq <- iq_train_labels.startweek[-index.iq, ]
 
nrow(train.iq)
## [1] 416
nrow(validation.iq)
## [1] 104
rm(index.iq)

Baseline models

Baseline model 1 for SJ

The baseline model shifts the total_cases down by one so that the values fall down to the next week. The difference between the original and the shifted values is taken and the RMSE is used as the metric to measure performance.

#create a new data frame from the lastna dataframe
sj_train_labels.shift <- sj_train_labels.lastna

# Lag total_cases by one week so week t's count "predicts" week t+1.
# The first week has no predecessor, so it gets 0 -- equivalent to the
# original c(NA, head(...)) construction followed by [1] <- 0, but in
# one vectorized step.
n <- nrow(sj_train_labels.shift)
sj_train_labels.shift$total_cases2 <- c(0, sj_train_labels.shift$total_cases[-n])

#take the difference between total_cases and total_cases2
sj_train_labels.shift$diff <- sj_train_labels.shift$total_cases2 - sj_train_labels.shift$total_cases

# Evaluate RMSE and MAE on the validation data
RMSE.SJ.baseline1 <- sqrt(mean((sj_train_labels.shift$diff)^2))
RMSE.SJ.baseline1
## [1] 15.34033
MAE.SJ.baseline1 <- mean(abs(sj_train_labels.shift$diff))
MAE.SJ.baseline1
## [1] 8.394231
rm(sj_train_labels.shift, n)

Baseline model 1 for IQ

The baseline model shifts the total_cases down by one so that the values fall down to the next week. The difference between the original and the shifted values is taken and the RMSE is used as the metric to measure performance.

#create a new data frame from the lastna dataframe
iq_train_labels.shift <- iq_train_labels.lastna

# Lag total_cases by one week so week t's count "predicts" week t+1.
# The first week has no predecessor, so it gets 0 -- equivalent to the
# original c(NA, head(...)) construction followed by [1] <- 0, but in
# one vectorized step.
n <- nrow(iq_train_labels.shift)
iq_train_labels.shift$total_cases2 <- c(0, iq_train_labels.shift$total_cases[-n])

#take the difference between total_cases and total_cases2
iq_train_labels.shift$diff <- iq_train_labels.shift$total_cases2 - iq_train_labels.shift$total_cases

# Evaluate RMSE and MAE on the validation data
RMSE.IQ.baseline1 <- sqrt(mean((iq_train_labels.shift$diff)^2))
RMSE.IQ.baseline1
## [1] 7.660086
MAE.IQ.baseline1 <- mean(abs(iq_train_labels.shift$diff))
MAE.IQ.baseline1
## [1] 3.930769
rm(iq_train_labels.shift, n)

Baseline model 2 for SJ

#Here is a plot showing which points belong to which set (train or test).
library(ggplot2)

# Baseline model - every validation week is predicted as the mean
# weekly case count of the training data
best.guess.sj <- mean(train.sj$total_cases)

# Evaluate RMSE and MAE on the validation data
error.sj <- best.guess.sj - validation.sj$total_cases

RMSE.SJ.baseline2 <- sqrt(mean(error.sj^2))
RMSE.SJ.baseline2
## [1] 44.41748
MAE.SJ.baseline2 <- mean(abs(error.sj))
MAE.SJ.baseline2
## [1] 27.03268
rm(best.guess.sj, error.sj)

Baseline model 2 for IQ

#Here is a plot showing which points belong to which set (train or test).

library(ggplot2)

# Baseline model - every validation week is predicted as the mean
# weekly case count of the training data
best.guess.iq <- mean(train.iq$total_cases)

# Evaluate RMSE and MAE on the validation data
error.iq <- best.guess.iq - validation.iq$total_cases

RMSE.IQ.baseline2 <- sqrt(mean(error.iq^2))
RMSE.IQ.baseline2
## [1] 11.6964
MAE.IQ.baseline2 <- mean(abs(error.iq))
MAE.IQ.baseline2
## [1] 7.383275
rm(best.guess.iq, error.iq)

Negative binomial regression (NBR)

NBR for SJ

library(MASS)
library(reshape2)
library(ggplot2)

#determine the dispersion of the total_cases

# BUG FIX: the digits argument belongs to round(), not with(). In the
# original `round(with(df, mean(total_cases), 2))` the 2 was swallowed
# by with()'s `...` and silently ignored (so values printed with 0
# decimal places).
round(with(sj_train_labels.startweek, mean(total_cases)), 2)
## [1] 34
round(with(sj_train_labels.startweek, var(total_cases)), 2)
## [1] 2640
#As there is over-dispersion of total_cases (variance is greater than the mean) we can go ahead and build the NBR model 

#Build the model
# Columns 4:24 look like total_cases plus the 20 climate/vegetation
# predictors (excluding city/year/week metadata) -- TODO confirm indices
model_sj.nbr <- glm.nb(formula = total_cases ~ ., data = train.sj[,4:24])

summary(model_sj.nbr)
## 
## Call:
## glm.nb(formula = total_cases ~ ., data = train.sj[, 4:24], init.theta = 1.238752535, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6005  -1.0132  -0.4303   0.2511   3.7686  
## 
## Coefficients: (1 not defined because of singularities)
##                                         Estimate Std. Error z value
## (Intercept)                           -36.095981  35.906521  -1.005
## ndvi_ne                                -0.075816   0.439385  -0.173
## ndvi_nw                                 1.745488   0.506357   3.447
## ndvi_se                                -6.437402   0.998317  -6.448
## ndvi_sw                                 5.462943   1.033934   5.284
## precipitation_amt_mm                   -0.002296   0.001069  -2.148
## reanalysis_air_temp_k                   2.859593   1.787091   1.600
## reanalysis_avg_temp_k                  -0.882312   0.449115  -1.965
## reanalysis_dew_point_temp_k            -2.793170   1.662257  -1.680
## reanalysis_max_air_temp_k               0.301373   0.106441   2.831
## reanalysis_min_air_temp_k              -0.052397   0.112896  -0.464
## reanalysis_precip_amt_kg_per_m2         0.001093   0.001273   0.858
## reanalysis_relative_humidity_percent    0.400739   0.369438   1.085
## reanalysis_sat_precip_amt_mm                  NA         NA      NA
## reanalysis_specific_humidity_g_per_kg   1.028983   0.466649   2.205
## reanalysis_tdtr_k                      -0.608646   0.118727  -5.126
## station_avg_temp_c                     -0.222566   0.114877  -1.937
## station_diur_temp_rng_c                 0.055151   0.071591   0.770
## station_max_temp_c                      0.041047   0.057318   0.716
## station_min_temp_c                     -0.007992   0.070262  -0.114
## station_precip_mm                      -0.001965   0.001690  -1.163
##                                       Pr(>|z|)    
## (Intercept)                           0.314764    
## ndvi_ne                               0.863005    
## ndvi_nw                               0.000567 ***
## ndvi_se                               1.13e-10 ***
## ndvi_sw                               1.27e-07 ***
## precipitation_amt_mm                  0.031687 *  
## reanalysis_air_temp_k                 0.109568    
## reanalysis_avg_temp_k                 0.049465 *  
## reanalysis_dew_point_temp_k           0.092890 .  
## reanalysis_max_air_temp_k             0.004635 ** 
## reanalysis_min_air_temp_k             0.642565    
## reanalysis_precip_amt_kg_per_m2       0.390702    
## reanalysis_relative_humidity_percent  0.278043    
## reanalysis_sat_precip_amt_mm                NA    
## reanalysis_specific_humidity_g_per_kg 0.027451 *  
## reanalysis_tdtr_k                     2.95e-07 ***
## station_avg_temp_c                    0.052694 .  
## station_diur_temp_rng_c               0.441080    
## station_max_temp_c                    0.473911    
## station_min_temp_c                    0.909437    
## station_precip_mm                     0.244969    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(1.2388) family taken to be 1)
## 
##     Null deviance: 1085.5  on 747  degrees of freedom
## Residual deviance:  827.3  on 728  degrees of freedom
## AIC: 6640.9
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  1.2388 
##           Std. Err.:  0.0614 
## 
##  2 x log-likelihood:  -6598.8710
# Predict expected weekly case counts on the hold-out set
prediction_sj.nbr <- predict(model_sj.nbr, validation.sj, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
#Plot the prediction for NBR
# Long format so ggplot can colour predicted vs actual as two series
df_prediction_sj.nbr <- melt(
  data.frame(prediction = prediction_sj.nbr,
             actual     = validation.sj$total_cases,
             time       = validation.sj$week_start_date),
  id.vars = "time")

ggplot(df_prediction_sj.nbr, aes(x = time, y = value, color = variable)) +
  geom_line() +
  ggtitle("NBR: Dengue predicted Cases vs. Actual Cases (City-San Juan) ")

# Evaluate RMSE and MAE on the validation data
nbr.error.sj <- prediction_sj.nbr - validation.sj$total_cases

RMSE.SJ.nbr <- sqrt(mean(nbr.error.sj^2))
RMSE.SJ.nbr
## [1] 42.50416
MAE.SJ.nbr <- mean(abs(nbr.error.sj))
MAE.SJ.nbr
## [1] 26.15555
rm(df_prediction_sj.nbr, model_sj.nbr, prediction_sj.nbr, nbr.error.sj)

NBR for IQ

library(MASS)
library(reshape2)
library(ggplot2)

#determine the dispersion of the total_cases

# BUG FIX: the digits argument belongs to round(), not with(). In the
# original `round(with(df, mean(total_cases), 2))` the 2 was swallowed
# by with()'s `...` and silently ignored (so values printed with 0
# decimal places).
round(with(iq_train_labels.startweek, mean(total_cases)), 2)
## [1] 8
round(with(iq_train_labels.startweek, var(total_cases)), 2)
## [1] 116
#As there is over-dispersion of total_cases (variance is greater than the mean) we can go ahead and build the NBR model 

#Build the model
# Columns 4:24 look like total_cases plus the 20 climate/vegetation
# predictors (excluding city/year/week metadata) -- TODO confirm indices
model_iq.nbr <- glm.nb(formula = total_cases ~ ., data = train.iq[,4:24])

summary(model_iq.nbr)
## 
## Call:
## glm.nb(formula = total_cases ~ ., data = train.iq[, 4:24], init.theta = 0.8927711016, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3298  -1.1448  -0.3802   0.3018   3.6056  
## 
## Coefficients: (1 not defined because of singularities)
##                                         Estimate Std. Error z value
## (Intercept)                           -3.9555345 14.4163166  -0.274
## ndvi_ne                                2.0225152  1.5012052   1.347
## ndvi_nw                                1.3923607  1.2471222   1.116
## ndvi_se                               -4.5143238  1.1676946  -3.866
## ndvi_sw                               -0.3853344  1.3088897  -0.294
## precipitation_amt_mm                  -0.0004742  0.0019193  -0.247
## reanalysis_air_temp_k                  0.3821218  0.6047721   0.632
## reanalysis_avg_temp_k                 -0.1713872  0.2741650  -0.625
## reanalysis_dew_point_temp_k           -1.4903126  0.8783992  -1.697
## reanalysis_max_air_temp_k             -0.0937201  0.0551876  -1.698
## reanalysis_min_air_temp_k              0.0119852  0.0798743   0.150
## reanalysis_precip_amt_kg_per_m2       -0.0023324  0.0014441  -1.615
## reanalysis_relative_humidity_percent   0.0330521  0.1314110   0.252
## reanalysis_sat_precip_amt_mm                  NA         NA      NA
## reanalysis_specific_humidity_g_per_kg  1.5909235  0.7529229   2.113
## reanalysis_tdtr_k                      0.0354126  0.0897535   0.395
## station_avg_temp_c                     0.0127747  0.1083638   0.118
## station_diur_temp_rng_c                0.0277937  0.0668043   0.416
## station_max_temp_c                     0.1462946  0.0704404   2.077
## station_min_temp_c                     0.0706209  0.0738444   0.956
## station_precip_mm                      0.0009194  0.0009481   0.970
##                                       Pr(>|z|)    
## (Intercept)                           0.783793    
## ndvi_ne                               0.177896    
## ndvi_nw                               0.264226    
## ndvi_se                               0.000111 ***
## ndvi_sw                               0.768454    
## precipitation_amt_mm                  0.804843    
## reanalysis_air_temp_k                 0.527489    
## reanalysis_avg_temp_k                 0.531889    
## reanalysis_dew_point_temp_k           0.089768 .  
## reanalysis_max_air_temp_k             0.089468 .  
## reanalysis_min_air_temp_k             0.880725    
## reanalysis_precip_amt_kg_per_m2       0.106282    
## reanalysis_relative_humidity_percent  0.801414    
## reanalysis_sat_precip_amt_mm                NA    
## reanalysis_specific_humidity_g_per_kg 0.034601 *  
## reanalysis_tdtr_k                     0.693172    
## station_avg_temp_c                    0.906157    
## station_diur_temp_rng_c               0.677376    
## station_max_temp_c                    0.037815 *  
## station_min_temp_c                    0.338897    
## station_precip_mm                     0.332217    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.8928) family taken to be 1)
## 
##     Null deviance: 561.45  on 415  degrees of freedom
## Residual deviance: 474.29  on 396  degrees of freedom
## AIC: 2495.4
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.8928 
##           Std. Err.:  0.0737 
## 
##  2 x log-likelihood:  -2453.4450
# Predict expected weekly case counts on the hold-out set
prediction_iq.nbr <- predict(model_iq.nbr, validation.iq, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
#Plot the prediction for NBR
# Long format so ggplot can colour predicted vs actual as two series
df_prediction_iq.nbr <- melt(
  data.frame(prediction = prediction_iq.nbr,
             actual     = validation.iq$total_cases,
             time       = validation.iq$week_start_date),
  id.vars = "time")

ggplot(df_prediction_iq.nbr, aes(x = time, y = value, color = variable)) +
  geom_line() +
  ggtitle("NBR: Dengue predicted Cases vs. Actual Cases (City-IQUITOS) ")

# Evaluate RMSE and MAE on the validation data
nbr.error.iq <- prediction_iq.nbr - validation.iq$total_cases

RMSE.IQ.nbr <- sqrt(mean(nbr.error.iq^2))
RMSE.IQ.nbr
## [1] 11.934
MAE.IQ.nbr <- mean(abs(nbr.error.iq))
MAE.IQ.nbr
## [1] 7.742887
rm(df_prediction_iq.nbr, model_iq.nbr, prediction_iq.nbr, nbr.error.iq)

Support Vector Machines

SVM for SJ

library(kernlab)
## Warning: package 'kernlab' was built under R version 3.4.4
## 
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
## 
##     alpha
## The following object is masked from 'package:psych':
## 
##     alpha
library(reshape2)
library(ggplot2)

# Build the model: linear-kernel SVM regression on all San Juan predictors
model_sj.svm <- ksvm(total_cases ~  ., data = train.sj, kernel = "vanilladot")
##  Setting default kernel parameters
prediction_sj.svm <-  predict(model_sj.svm, validation.sj)

# Plot predicted vs. actual cases for the SVM model
# (previous comment said "NBR" — copy-paste leftover from the section above)
df_prediction_sj.svm <- data.frame('prediction' = prediction_sj.svm,
                                   'actual' = validation.sj$total_cases,
                                   'time' = validation.sj$week_start_date)

# Reshape wide -> long so both series share one ggplot legend
df_prediction_sj.svm <- melt(df_prediction_sj.svm, id.vars = 'time')

ggplot(df_prediction_sj.svm, aes(x = time, y = value, color = variable)) +
  geom_line() +
  ggtitle('SVM: Dengue predicted Cases vs. Actual Cases (City-San Juan) ')

# Evaluate RMSE and MAE on the validation data
RMSE.SJ.svm <- sqrt(mean((prediction_sj.svm-validation.sj$total_cases)^2))
RMSE.SJ.svm
## [1] 42.23424
MAE.SJ.svm <- mean(abs(prediction_sj.svm-validation.sj$total_cases))
MAE.SJ.svm
## [1] 19.76721
# Keep RMSE.SJ.svm / MAE.SJ.svm for the comparison table; drop the rest
rm(df_prediction_sj.svm, model_sj.svm, prediction_sj.svm)

SVM for IQ

library(kernlab)
library(reshape2)
library(ggplot2)

#Build the model
model_iq.svm <- ksvm(total_cases ~  ., data = train.iq, kernel = "vanilladot")
##  Setting default kernel parameters
prediction_iq.svm <-  predict(model_iq.svm, validation.iq)

#Plot the prediction for NBR
df_prediction_iq.svm <- data.frame('prediction' = prediction_iq.svm,
                                   'actual' = validation.iq$total_cases,
                                   'time' = validation.iq$week_start_date)

df_prediction_iq.svm <- melt(df_prediction_iq.svm, id.vars = 'time')

ggplot(df_prediction_iq.svm, aes(x = time, y = value, color = variable)) +
  geom_line() +
  ggtitle('SVM: Dengue predicted Cases vs. Actual Cases (City-Iquitos) ')

# Evaluate RMSE and MAE on the validation data
RMSE.IQ.svm <- sqrt(mean((prediction_iq.svm-validation.iq$total_cases)^2))
RMSE.IQ.svm
## [1] 11.95082
MAE.IQ.svm <- mean(abs(prediction_iq.svm-validation.iq$total_cases))
MAE.IQ.svm
## [1] 6.398152
rm(df_prediction_iq.svm, model_iq.svm, prediction_iq.svm)

Random Forest

RF without CV for SJ

library(reshape2)
library(ggplot2)
library(randomForest)
library(caret)

# Fixed seed so the random forest (bootstrap sampling) is reproducible
set.seed(136)

# Build the model on columns 4:24 only
# NOTE(review): presumably columns 1:3 are identifiers (city/year/week) being
# excluded from the predictors — confirm against the construction of train.sj
model_sj.rf <- randomForest(formula = total_cases ~ ., data = train.sj[,4:24])

model_sj.rf
## 
## Call:
##  randomForest(formula = total_cases ~ ., data = train.sj[, 4:24]) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 6
## 
##           Mean of squared residuals: 1444.554
##                     % Var explained: 48.49
prediction_sj.rf <-  predict(model_sj.rf, validation.sj, type = 'response')

# Plot predicted vs. actual cases for the random forest model
# (previous comment said "NBR" — copy-paste leftover from an earlier section)
df_prediction_sj.rf <- data.frame('prediction' = prediction_sj.rf,
                                   'actual' = validation.sj$total_cases,
                                   'time' = validation.sj$week_start_date)

# Reshape wide -> long so both series share one ggplot legend
df_prediction_sj.rf <- melt(df_prediction_sj.rf, id.vars = 'time')

ggplot(df_prediction_sj.rf, aes(x = time, y = value, color = variable)) +
  geom_line() +
  ggtitle('RF: Dengue predicted Cases vs. Actual Cases (City-San Juan) ')

# Evaluate RMSE and MAE on the validation data
RMSE.SJ.rf <- sqrt(mean((prediction_sj.rf-validation.sj$total_cases)^2))
RMSE.SJ.rf
## [1] 34.51898
MAE.SJ.rf <- mean(abs(prediction_sj.rf-validation.sj$total_cases))
MAE.SJ.rf
## [1] 24.67019
# Keep RMSE.SJ.rf / MAE.SJ.rf for the comparison table; drop the rest
rm(df_prediction_sj.rf, model_sj.rf, prediction_sj.rf)

RF without CV for IQ

library(reshape2)
library(ggplot2)
library(randomForest)
library(caret)

set.seed(136)

#Build the model
model_iq.rf <- randomForest(formula = total_cases ~ ., data = train.iq[,4:24])

model_iq.rf
## 
## Call:
##  randomForest(formula = total_cases ~ ., data = train.iq[, 4:24]) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 6
## 
##           Mean of squared residuals: 106.1351
##                     % Var explained: 3.88
prediction_iq.rf <-  predict(model_iq.rf, validation.iq, type = 'response')

#Plot the prediction for NBR
df_prediction_iq.rf <- data.frame('prediction' = prediction_iq.rf,
                                   'actual' = validation.iq$total_cases,
                                   'time' = validation.iq$week_start_date)

df_prediction_iq.rf <- melt(df_prediction_iq.rf, id.vars = 'time')

ggplot(df_prediction_iq.rf, aes(x = time, y = value, color = variable)) +
  geom_line() +
  ggtitle('RF: Dengue predicted Cases vs. Actual Cases (City-IQUITOS) ')

# Evaluate RMSE and MAE on the validation data
RMSE.IQ.rf <- sqrt(mean((prediction_iq.rf-validation.iq$total_cases)^2))
RMSE.IQ.rf
## [1] 11.67287
MAE.IQ.rf <- mean(abs(prediction_iq.rf-validation.iq$total_cases))
MAE.IQ.rf
## [1] 7.544626
rm(df_prediction_iq.rf, model_iq.rf, prediction_iq.rf)

Multi Layer Perceptron

MLP for SJ

MLP for IQ

EVALUATION OF PREDICTIVE MODELS

MAE and RMSE for Predictive Models

MAE and RMSE for SJ

# Create a data frame with the error metrics for each method.
# BUG FIX: the RMSE and MAE vectors previously listed the SVM value before the
# NB Regression value, while the Method labels list "NB Regression" before
# "SVM" — so those two rows were swapped (the original printed table shows
# "NB Regression 42.23 19.77", which are in fact the SVM metrics computed
# above). The value vectors are now ordered to match the labels.
accuracy <- data.frame(Method = c("Baseline1",
                                  "Baseline2",
                                  "NB Regression",
                                  "SVM",
                                  "Random forest"),
                       RMSE   = c(RMSE.SJ.baseline1,
                                  RMSE.SJ.baseline2,
                                  RMSE.SJ.nbr,
                                  RMSE.SJ.svm,
                                  RMSE.SJ.rf),
                       MAE    = c(MAE.SJ.baseline1,
                                  MAE.SJ.baseline2,
                                  MAE.SJ.nbr,
                                  MAE.SJ.svm,
                                  MAE.SJ.rf))

# Round the values to 2 decimals and print the comparison table
accuracy$RMSE <- round(accuracy$RMSE,2)
accuracy$MAE <- round(accuracy$MAE,2)

accuracy
## With the corrected row alignment the table reads:
##          Method  RMSE   MAE
## 1     Baseline1 15.34  8.39
## 2     Baseline2 44.42 27.03
## 3 NB Regression 42.50 26.16
## 4           SVM 42.23 19.77
## 5 Random forest 34.52 24.67
# The metrics are no longer needed after tabulation — drop them
rm(accuracy, 
   RMSE.SJ.baseline1,
   RMSE.SJ.baseline2,
   RMSE.SJ.svm,
   RMSE.SJ.nbr,
   RMSE.SJ.rf,
   MAE.SJ.baseline1,
   MAE.SJ.baseline2,
   MAE.SJ.svm,
   MAE.SJ.nbr,
   MAE.SJ.rf)

MAE and RMSE for IQ

# Create a data frame with the error metrics for each method.
# BUG FIX: the RMSE and MAE vectors previously listed the SVM value before the
# NB Regression value, while the Method labels list "NB Regression" before
# "SVM" — so those two rows were swapped (the original printed table shows
# "NB Regression 11.95 6.40", which are in fact the SVM metrics computed
# above). The value vectors are now ordered to match the labels.
accuracy <- data.frame(Method = c("Baseline1",
                                  "Baseline2",
                                  "NB Regression",
                                  "SVM",
                                  "Random forest"),
                       RMSE   = c(RMSE.IQ.baseline1,
                                  RMSE.IQ.baseline2,
                                  RMSE.IQ.nbr,
                                  RMSE.IQ.svm,
                                  RMSE.IQ.rf),
                       MAE    = c(MAE.IQ.baseline1,
                                  MAE.IQ.baseline2,
                                  MAE.IQ.nbr,
                                  MAE.IQ.svm,
                                  MAE.IQ.rf))

# Round the values to 2 decimals and print the comparison table
accuracy$RMSE <- round(accuracy$RMSE,2)
accuracy$MAE <- round(accuracy$MAE,2)

accuracy
## With the corrected row alignment the table reads:
##          Method  RMSE  MAE
## 1     Baseline1  7.66 3.93
## 2     Baseline2 11.70 7.38
## 3 NB Regression 11.93 7.74
## 4           SVM 11.95 6.40
## 5 Random forest 11.67 7.54
# The metrics are no longer needed after tabulation — drop them
rm(accuracy, 
   RMSE.IQ.baseline1,
   RMSE.IQ.baseline2,
   RMSE.IQ.svm,
   RMSE.IQ.nbr,
   RMSE.IQ.rf,
   MAE.IQ.baseline1,
   MAE.IQ.baseline2,
   MAE.IQ.svm,
   MAE.IQ.nbr,
   MAE.IQ.rf)

Drop training and validation sets

# Free the per-city hold-out splits; the cross-validation below trains on the
# full labelled data (sj_train_labels.lastna) instead of these splits
rm(train.iq, train.sj, validation.iq, validation.sj)

CROSS-VALIDATION - Build training and validation sets

CV for SJ

library(caret)

# Fixed seed so the repeated CV fold assignments are reproducible
set.seed(136)

# Candidate learners to compare via caret's unified train() interface
methods <- c("nnet","rf", "mlp", "rpart", "svmLinear", "svmRadial", "neuralnet")
performetrics <- data.frame()
# trainControl: 10-fold cross-validation, repeated 3 times
control <- trainControl(method="repeatedcv", number=10, repeats=3)

# seq_along() instead of 1:length(methods): behaves correctly (empty loop)
# if the methods vector is ever empty
for (i in seq_along(methods)){
  # Train the i-th model on all predictors in columns 3:23
  # NOTE(review): presumably the first two columns of sj_train_labels.lastna
  # are identifiers being excluded — confirm against its construction
  model_sj.cv <- train(total_cases~.,
                       data=sj_train_labels.lastna[3:23],
                       method=methods[i],
                       trControl=control)
  # Record method name and its best (minimum) resampled MAE and RMSE.
  # NOTE(review): the two minima may come from different tuning-grid rows,
  # so V2/V3 need not describe the same hyper-parameter setting.
  performetrics[i,1] <- methods[i]
  performetrics[i,2] <- min(model_sj.cv$results["MAE"])
  performetrics[i,3] <- min(model_sj.cv$results["RMSE"])

}
## # weights:  23
## initial  value 3209630.731535 
## final  value 3172060.000000 
## converged
## # weights:  67
## initial  value 3197802.688580 
## final  value 3172060.000000 
## converged
## # weights:  111
## initial  value 3183375.193811 
## final  value 3172060.000000 
## converged
## # weights:  23
## initial  value 3200281.995395 
## iter  10 value 3172239.909532
## iter  20 value 3172067.096499
## iter  20 value 3172067.067843
## iter  20 value 3172067.067843
## final  value 3172067.067843 
## converged
## # weights:  67
## initial  value 3187166.247585 
## iter  10 value 3172108.357417
## iter  20 value 3172064.045624
## final  value 3172063.881035 
## converged
## # weights:  111
## initial  value 3199405.073932 
## iter  10 value 3172404.826908
## iter  20 value 3172063.103337
## final  value 3172062.756619 
## converged
## # weights:  23
## initial  value 3195874.345250 
## iter  10 value 3172093.846509
## iter  20 value 3172060.396243
## final  value 3172060.047947 
## converged
## # weights:  67
## initial  value 3203956.755399 
## iter  10 value 3172229.741711
## iter  20 value 3172061.971888
## final  value 3172060.070071 
## converged
## # weights:  111
## initial  value 3194486.476217 
## iter  10 value 3172168.991766
## iter  20 value 3172061.283520
## final  value 3172060.082382 
## converged
## # weights:  23
## initial  value 3249044.736232 
## final  value 3223549.000000 
## converged
## # weights:  67
## initial  value 3245209.784130 
## final  value 3223549.000000 
## converged
## # weights:  111
## initial  value 3248865.230244 
## final  value 3223549.000000 
## converged
## # weights:  23
## initial  value 3240170.913117 
## iter  10 value 3223559.483260
## final  value 3223555.987962 
## converged
## # weights:  67
## initial  value 3258653.147534 
## iter  10 value 3224044.133102
## iter  20 value 3223566.137988
## iter  30 value 3223553.348355
## final  value 3223553.185035 
## converged
## # weights:  111
## initial  value 3235776.207773 
## iter  10 value 3223926.272917
## iter  20 value 3223552.906327
## final  value 3223551.758444 
## converged
## # weights:  23
## initial  value 3247952.126468 
## iter  10 value 3223583.367202
## iter  20 value 3223549.400897
## final  value 3223549.047231 
## converged
## # weights:  67
## initial  value 3254265.634668 
## iter  10 value 3223695.257227
## iter  20 value 3223550.705577
## final  value 3223549.126293 
## converged
## # weights:  111
## initial  value 3256303.414416 
## final  value 3223557.972553 
## converged
## # weights:  23
## initial  value 3110247.550726 
## final  value 3076043.000000 
## converged
## # weights:  67
## initial  value 3112799.499910 
## final  value 3076043.000000 
## converged
## # weights:  111
## initial  value 3096369.196755 
## final  value 3076043.000000 
## converged
## # weights:  23
## initial  value 3103212.772398 
## iter  10 value 3076089.061521
## final  value 3076049.962419 
## converged
## # weights:  67
## initial  value 3099748.349269 
## iter  10 value 3076049.937682
## iter  20 value 3076047.294181
## final  value 3076046.903708 
## converged
## # weights:  111
## initial  value 3097708.039027 
## iter  10 value 3076423.573483
## iter  20 value 3076046.099232
## final  value 3076045.779487 
## converged
## # weights:  23
## initial  value 3097194.914019 
## iter  10 value 3076073.788010
## iter  20 value 3076043.359675
## final  value 3076043.042846 
## converged
## # weights:  67
## initial  value 3096728.038765 
## iter  10 value 3076080.844093
## iter  20 value 3076043.449343
## final  value 3076043.080566 
## converged
## # weights:  111
## initial  value 3088181.780633 
## iter  10 value 3076124.348723
## iter  20 value 3076043.966954
## final  value 3076043.070526 
## converged
## # weights:  23
## initial  value 3348836.239697 
## final  value 3333492.000000 
## converged
## # weights:  67
## initial  value 3363172.155721 
## final  value 3333492.000000 
## converged
## # weights:  111
## initial  value 3372505.034740 
## final  value 3333492.000000 
## converged
## # weights:  23
## initial  value 3358434.729853 
## iter  10 value 3333553.348634
## iter  20 value 3333499.046939
## iter  20 value 3333499.032860
## iter  20 value 3333499.032135
## final  value 3333499.032135 
## converged
## # weights:  67
## initial  value 3373530.954683 
## iter  10 value 3333505.648196
## final  value 3333497.620753 
## converged
## # weights:  111
## initial  value 3369647.092954 
## iter  10 value 3333900.074205
## iter  20 value 3333495.395439
## final  value 3333495.134774 
## converged
## # weights:  23
## initial  value 3357839.571697 
## iter  10 value 3333529.261629
## iter  20 value 3333492.435832
## final  value 3333492.052390 
## converged
## # weights:  67
## initial  value 3348865.620035 
## iter  10 value 3333559.300594
## iter  20 value 3333492.793479
## final  value 3333492.099765 
## converged
## # weights:  111
## initial  value 3365216.613903 
## iter  10 value 3333693.698045
## iter  20 value 3333494.353151
## final  value 3333492.166220 
## converged
## # weights:  23
## initial  value 3141073.838283 
## final  value 3120988.000000 
## converged
## # weights:  67
## initial  value 3148026.329035 
## final  value 3120988.000000 
## converged
## # weights:  111
## initial  value 3157828.233653 
## final  value 3120988.000000 
## converged
## # weights:  23
## initial  value 3144428.292469 
## iter  10 value 3121157.094096
## final  value 3120994.972953 
## converged
## # weights:  67
## initial  value 3154261.526797 
## iter  10 value 3121039.472453
## iter  20 value 3120992.969465
## iter  20 value 3120992.956099
## iter  20 value 3120992.956099
## final  value 3120992.956099 
## converged
## # weights:  111
## initial  value 3151745.614646 
## iter  10 value 3121061.078223
## iter  20 value 3120991.425254
## final  value 3120991.209412 
## converged
## # weights:  23
## initial  value 3150557.807793 
## iter  10 value 3121023.331498
## iter  20 value 3120988.412241
## final  value 3120988.048652 
## converged
## # weights:  67
## initial  value 3140136.926537 
## iter  10 value 3121100.484013
## iter  20 value 3120989.316104
## final  value 3120988.094238 
## converged
## # weights:  111
## initial  value 3154361.325150 
## iter  10 value 3121153.346111
## iter  20 value 3120989.935360
## final  value 3120988.111752 
## converged
## # weights:  23
## initial  value 3079870.099543 
## final  value 3054256.000000 
## converged
## # weights:  67
## initial  value 3077001.574388 
## final  value 3054256.000000 
## converged
## # weights:  111
## initial  value 3095480.288624 
## final  value 3054256.000000 
## converged
## # weights:  23
## initial  value 3075523.687495 
## iter  10 value 3054265.374457
## final  value 3054262.982702 
## converged
## # weights:  67
## initial  value 3078378.358658 
## iter  10 value 3054290.763960
## iter  20 value 3054260.041464
## iter  20 value 3054260.022610
## final  value 3054259.920833 
## converged
## # weights:  111
## initial  value 3091312.372691 
## iter  10 value 3054470.241965
## final  value 3054259.355974 
## converged
## # weights:  23
## initial  value 3086639.516313 
## iter  10 value 3054392.134231
## iter  20 value 3054257.575409
## final  value 3054256.069525 
## converged
## # weights:  67
## initial  value 3077235.766942 
## iter  10 value 3054311.470741
## iter  20 value 3054256.657345
## final  value 3054256.061881 
## converged
## # weights:  111
## initial  value 3085408.103725 
## final  value 3054258.646258 
## converged
## # weights:  23
## initial  value 3223749.640386 
## final  value 3190550.000000 
## converged
## # weights:  67
## initial  value 3214577.605914 
## final  value 3190550.000000 
## converged
## # weights:  111
## initial  value 3233920.941134 
## final  value 3190550.000000 
## converged
## # weights:  23
## initial  value 3227661.094225 
## iter  10 value 3191737.286454
## iter  20 value 3190564.820660
## final  value 3190556.975841 
## converged
## # weights:  67
## initial  value 3231694.536349 
## iter  10 value 3190598.618556
## iter  20 value 3190555.689146
## final  value 3190553.929168 
## converged
## # weights:  111
## initial  value 3218640.119559 
## iter  10 value 3190751.630465
## iter  20 value 3190553.178515
## final  value 3190552.777677 
## converged
## # weights:  23
## initial  value 3215363.685906 
## iter  10 value 3190584.405900
## iter  20 value 3190550.401525
## final  value 3190550.047462 
## converged
## # weights:  67
## initial  value 3228262.599575 
## iter  10 value 3190648.388881
## iter  20 value 3190551.154530
## final  value 3190550.072029 
## converged
## # weights:  111
## initial  value 3223132.848869 
## iter  10 value 3190590.742391
## iter  20 value 3190550.492512
## final  value 3190550.073378 
## converged
## # weights:  23
## initial  value 3300486.542786 
## final  value 3259705.000000 
## converged
## # weights:  67
## initial  value 3279683.018840 
## final  value 3259705.000000 
## converged
## # weights:  111
## initial  value 3270456.467898 
## final  value 3259705.000000 
## converged
## # weights:  23
## initial  value 3282122.639608 
## iter  10 value 3259720.244123
## final  value 3259717.495917 
## converged
## # weights:  67
## initial  value 3286047.526814 
## iter  10 value 3259857.964682
## iter  20 value 3259709.226134
## final  value 3259708.932108 
## converged
## # weights:  111
## initial  value 3275153.913687 
## iter  10 value 3259873.163928
## iter  20 value 3259708.252198
## iter  20 value 3259708.226259
## iter  20 value 3259708.221202
## final  value 3259708.221202 
## converged
## # weights:  23
## initial  value 3287653.303985 
## iter  10 value 3259778.767736
## iter  20 value 3259705.856807
## final  value 3259705.063864 
## converged
## # weights:  67
## initial  value 3285087.257291 
## iter  10 value 3259822.897615
## iter  20 value 3259706.377833
## final  value 3259705.073785 
## converged
## # weights:  111
## initial  value 3278655.780170 
## final  value 3259705.040746 
## converged
## # weights:  23
## initial  value 3015949.565057 
## final  value 2993143.000000 
## converged
## # weights:  67
## initial  value 3024603.134634 
## final  value 2993143.000000 
## converged
## # weights:  111
## initial  value 3021205.493431 
## final  value 2993143.000000 
## converged
## # weights:  23
## initial  value 3017800.626752 
## iter  10 value 2993240.542391
## iter  20 value 2993150.342436
## final  value 2993149.959774 
## converged
## # weights:  67
## initial  value 3014417.043287 
## iter  10 value 2993169.289412
## iter  20 value 2993148.131758
## final  value 2993146.901342 
## converged
## # weights:  111
## initial  value 3014060.556502 
## iter  10 value 2993155.293509
## iter  20 value 2993146.422220
## final  value 2993145.821403 
## converged
## # weights:  23
## initial  value 3018655.129383 
## iter  10 value 2993210.261010
## iter  20 value 2993143.782043
## final  value 2993143.040717 
## converged
## # weights:  67
## initial  value 3021686.820563 
## iter  10 value 2993222.754251
## iter  20 value 2993143.935431
## final  value 2993143.056474 
## converged
## # weights:  111
## initial  value 3014204.721978 
## iter  10 value 2993263.812354
## iter  20 value 2993144.423934
## final  value 2993143.119489 
## converged
## # weights:  23
## initial  value 3097514.307528 
## final  value 3066593.000000 
## converged
## # weights:  67
## initial  value 3076147.206548 
## final  value 3066593.000000 
## converged
## # weights:  111
## initial  value 3080946.009980 
## final  value 3066593.000000 
## converged
## # weights:  23
## initial  value 3096714.698577 
## iter  10 value 3066611.738406
## final  value 3066600.000760 
## converged
## # weights:  67
## initial  value 3087639.604104 
## iter  10 value 3066971.859218
## iter  20 value 3066598.488990
## final  value 3066598.001794 
## converged
## # weights:  111
## initial  value 3092190.103300 
## iter  10 value 3066607.205304
## iter  20 value 3066597.191252
## iter  30 value 3066595.816477
## final  value 3066595.758265 
## converged
## # weights:  23
## initial  value 3088249.561681 
## iter  10 value 3066655.438059
## iter  20 value 3066593.726959
## final  value 3066593.056598 
## converged
## # weights:  67
## initial  value 3097893.735604 
## iter  10 value 3066681.237707
## iter  20 value 3066594.034313
## final  value 3066593.117766 
## converged
## # weights:  111
## initial  value 3100473.779459 
## iter  10 value 3066721.655984
## iter  20 value 3066594.511691
## final  value 3066593.088710 
## converged
## # weights:  23
## initial  value 3372222.164415 
## final  value 3336258.000000 
## converged
## # weights:  67
## initial  value 3375305.028493 
## final  value 3336258.000000 
## converged
## # weights:  111
## initial  value 3371822.975021 
## final  value 3336258.000000 
## converged
## # weights:  23
## initial  value 3368261.988962 
## iter  10 value 3336267.319449
## final  value 3336264.997942 
## converged
## # weights:  67
## initial  value 3365797.333298 
## iter  10 value 3336368.843175
## final  value 3336262.160113 
## converged
## # weights:  111
## initial  value 3371034.103749 
## iter  10 value 3336435.333935
## iter  20 value 3336260.969075
## iter  20 value 3336260.959066
## final  value 3336260.776710 
## converged
## # weights:  23
## initial  value 3369761.411609 
## iter  10 value 3336327.403963
## iter  20 value 3336258.805793
## final  value 3336258.060625 
## converged
## # weights:  67
## initial  value 3361082.229973 
## final  value 3336258.632746 
## converged
## # weights:  111
## initial  value 3368099.287314 
## iter  10 value 3336403.004804
## iter  20 value 3336259.703144
## final  value 3336258.144614 
## converged
## # weights:  23
## initial  value 3219235.755473 
## final  value 3188931.000000 
## converged
## # weights:  67
## initial  value 3220907.814144 
## final  value 3188931.000000 
## converged
## # weights:  111
## initial  value 3228172.890360 
## final  value 3188931.000000 
## converged
## # weights:  23
## initial  value 3217071.444658 
## iter  10 value 3189200.055396
## iter  20 value 3188939.198368
## final  value 3188938.032983 
## converged
## # weights:  67
## initial  value 3217820.261349 
## iter  10 value 3188945.143215
## iter  20 value 3188935.781547
## final  value 3188935.618098 
## converged
## # weights:  111
## initial  value 3224162.207852 
## iter  10 value 3189301.972761
## iter  20 value 3188934.093573
## final  value 3188933.746702 
## converged
## # weights:  23
## initial  value 3212464.915480 
## iter  10 value 3188998.115057
## iter  20 value 3188931.779179
## final  value 3188931.076347 
## converged
## # weights:  67
## initial  value 3217500.369924 
## iter  10 value 3189009.567477
## iter  20 value 3188931.923061
## final  value 3188931.078609 
## converged
## # weights:  111
## initial  value 3219675.518465 
## iter  10 value 3189009.396443
## iter  20 value 3188931.935607
## final  value 3188931.075546 
## converged
## # weights:  23
## initial  value 3402949.467121 
## final  value 3365946.000000 
## converged
## # weights:  67
## initial  value 3389943.934732 
## final  value 3365946.000000 
## converged
## # weights:  111
## initial  value 3395156.546999 
## final  value 3365946.000000 
## converged
## # weights:  23
## initial  value 3397784.260558 
## iter  10 value 3365956.214150
## final  value 3365952.998832 
## converged
## # weights:  67
## initial  value 3400824.997270 
## iter  10 value 3366174.109824
## iter  20 value 3365950.418703
## final  value 3365949.918008 
## converged
## # weights:  111
## initial  value 3389725.298422 
## iter  10 value 3366287.308587
## iter  20 value 3365950.895317
## final  value 3365948.873529 
## converged
## # weights:  23
## initial  value 3400030.904051 
## iter  10 value 3365982.931188
## iter  20 value 3365946.433185
## final  value 3365946.053153 
## converged
## # weights:  67
## initial  value 3387998.157928 
## iter  10 value 3366016.792199
## iter  20 value 3365946.835039
## final  value 3365946.095885 
## converged
## # weights:  111
## initial  value 3396084.482973 
## final  value 3365946.056456 
## converged
## # weights:  23
## initial  value 2875200.386612 
## final  value 2840876.000000 
## converged
## # weights:  67
## initial  value 2884539.824092 
## final  value 2840876.000000 
## converged
## # weights:  111
## initial  value 2859804.607379 
## final  value 2840876.000000 
## converged
## # weights:  23
## initial  value 2861785.385640 
## iter  10 value 2840905.966764
## final  value 2840882.955714 
## converged
## # weights:  67
## initial  value 2878768.970805 
## iter  10 value 2840922.612440
## iter  20 value 2840881.288641
## final  value 2840880.912545 
## converged
## # weights:  111
## initial  value 2879533.034639 
## iter  10 value 2840908.153646
## iter  20 value 2840884.022118
## final  value 2840881.587182 
## converged
## # weights:  23
## initial  value 2866924.386694 
## iter  10 value 2840909.532004
## iter  20 value 2840876.393786
## final  value 2840876.048736 
## converged
## # weights:  67
## initial  value 2875196.720113 
## iter  10 value 2840936.071543
## iter  20 value 2840876.710843
## final  value 2840876.086918 
## converged
## # weights:  111
## initial  value 2855944.562223 
## iter  10 value 2840964.468249
## iter  20 value 2840877.050177
## final  value 2840876.076135 
## converged
## # weights:  23
## initial  value 3326304.180783 
## final  value 3291403.000000 
## converged
## # weights:  67
## initial  value 3328806.258254 
## final  value 3291403.000000 
## converged
## # weights:  111
## initial  value 3327589.572318 
## final  value 3291403.000000 
## converged
## # weights:  23
## initial  value 3328804.551539 
## iter  10 value 3291417.300037
## final  value 3291410.013852 
## converged
## # weights:  67
## initial  value 3337928.733960 
## iter  10 value 3291413.925808
## final  value 3291410.016077 
## converged
## # weights:  111
## initial  value 3339787.876057 
## iter  10 value 3291454.585761
## iter  20 value 3291417.730654
## iter  30 value 3291407.629054
## iter  40 value 3291405.791689
## iter  40 value 3291405.776020
## iter  40 value 3291405.764281
## final  value 3291405.764281 
## converged
## # weights:  23
## initial  value 3326209.479733 
## iter  10 value 3291469.390093
## iter  20 value 3291403.770720
## final  value 3291403.057910 
## converged
## # weights:  67
## initial  value 3335453.410643 
## iter  10 value 3291472.296860
## iter  20 value 3291403.812956
## final  value 3291403.093157 
## converged
## # weights:  111
## initial  value 3330082.550379 
## iter  10 value 3291600.609291
## iter  20 value 3291405.303193
## final  value 3291403.117416 
## converged
## # weights:  23
## initial  value 3319660.932888 
## final  value 3282104.000000 
## converged
## # weights:  67
## initial  value 3317563.668671 
## final  value 3282104.000000 
## converged
## # weights:  111
## initial  value 3320364.230834 
## final  value 3282104.000000 
## converged
## # weights:  23
## initial  value 3318914.670622 
## iter  10 value 3282621.697996
## iter  20 value 3282117.116367
## final  value 3282110.979144 
## converged
## # weights:  67
## initial  value 3313132.255289 
## iter  10 value 3282127.230086
## iter  20 value 3282110.381065
## final  value 3282107.901824 
## converged
## # weights:  111
## initial  value 3321573.349911 
## iter  10 value 3282168.769065
## iter  20 value 3282107.059728
## iter  20 value 3282107.043387
## final  value 3282106.770804 
## converged
## # weights:  23
## initial  value 3312606.232289 
## iter  10 value 3282139.538538
## iter  20 value 3282104.417149
## final  value 3282104.051449 
## converged
## # weights:  67
## initial  value 3303108.547357 
## iter  10 value 3282196.326271
## iter  20 value 3282105.079972
## final  value 3282104.062438 
## converged
## # weights:  111
## initial  value 3299598.735759 
## final  value 3282104.056011 
## converged
## # weights:  23
## initial  value 3276110.364056 
## final  value 3241576.000000 
## converged
## # weights:  67
## initial  value 3276694.993988 
## final  value 3241576.000000 
## converged
## # weights:  111
## initial  value 3274196.355228 
## final  value 3241576.000000 
## converged
## # weights:  23
## initial  value 3274325.456662 
## iter  10 value 3241588.473112
## iter  20 value 3241583.193449
## final  value 3241583.014296 
## converged
## # weights:  67
## initial  value 3266259.470317 
## iter  10 value 3241583.790451
## final  value 3241579.899445 
## converged
## # weights:  111
## initial  value 3274214.942803 
## iter  10 value 3241944.243640
## iter  20 value 3241579.543018
## final  value 3241578.782761 
## converged
## # weights:  23
## initial  value 3279888.299989 
## iter  10 value 3241630.758710
## iter  20 value 3241576.636109
## final  value 3241576.048186 
## converged
## # weights:  67
## initial  value 3265125.117355 
## iter  10 value 3241731.365952
## iter  20 value 3241577.810049
## final  value 3241576.141604 
## converged
## # weights:  111
## initial  value 3268834.765556 
## iter  10 value 3241628.688400
## iter  20 value 3241576.640716
## final  value 3241576.075252 
## converged
## # weights:  23
## initial  value 2856308.164881 
## final  value 2828007.000000 
## converged
## # weights:  67
## initial  value 2859262.029164 
## final  value 2828007.000000 
## converged
## # weights:  111
## initial  value 2853868.476784 
## final  value 2828007.000000 
## converged
## # weights:  23
## initial  value 2847914.798213 
## iter  10 value 2828082.331540
## iter  20 value 2828014.200742
## final  value 2828013.972055 
## converged
## # weights:  67
## initial  value 2847302.023729 
## iter  10 value 2828026.670250
## final  value 2828010.866284 
## converged
## # weights:  111
## initial  value 2850368.026241 
## iter  10 value 2828177.612752
## iter  20 value 2828015.911441
## iter  30 value 2828010.393183
## final  value 2828010.025798 
## converged
## # weights:  23
## initial  value 2854592.985367 
## iter  10 value 2828066.525005
## iter  20 value 2828007.693438
## final  value 2828007.054356 
## converged
## # weights:  67
## initial  value 2851904.723387 
## iter  10 value 2828087.133793
## iter  20 value 2828007.940377
## final  value 2828007.075117 
## converged
## # weights:  111
## initial  value 2853096.441452 
## iter  10 value 2828081.492447
## iter  20 value 2828007.885195
## final  value 2828007.084623 
## converged
## # weights:  23
## initial  value 2966235.039508 
## final  value 2930948.000000 
## converged
## # weights:  67
## initial  value 2951275.393781 
## final  value 2930948.000000 
## converged
## # weights:  111
## initial  value 2948696.206051 
## final  value 2930948.000000 
## converged
## # weights:  23
## initial  value 2953783.516532 
## iter  10 value 2931094.810595
## final  value 2930954.952779 
## converged
## # weights:  67
## initial  value 2943792.385710 
## iter  10 value 2931168.091348
## iter  20 value 2930952.031346
## final  value 2930951.911070 
## converged
## # weights:  111
## initial  value 2966017.471498 
## iter  10 value 2931114.066788
## iter  20 value 2930951.391641
## final  value 2930950.755428 
## converged
## # weights:  23
## initial  value 2948245.571813 
## iter  10 value 2930996.300416
## iter  20 value 2930948.561482
## final  value 2930948.050875 
## converged
## # weights:  67
## initial  value 2961996.697728 
## iter  10 value 2931072.738043
## iter  20 value 2930949.459193
## final  value 2930948.061684 
## converged
## # weights:  111
## initial  value 2975826.740242 
## iter  10 value 2931023.874138
## iter  20 value 2930948.899991
## final  value 2930948.162525 
## converged
## # weights:  23
## initial  value 3200185.095834 
## final  value 3184330.000000 
## converged
## # weights:  67
## initial  value 3206181.245436 
## final  value 3184330.000000 
## converged
## # weights:  111
## initial  value 3223421.899216 
## final  value 3184330.000000 
## converged
## # weights:  23
## initial  value 3222441.509129 
## iter  10 value 3184340.670116
## final  value 3184337.002946 
## converged
## # weights:  67
## initial  value 3217821.127839 
## iter  10 value 3184528.733266
## iter  20 value 3184334.114894
## final  value 3184333.909782 
## converged
## # weights:  111
## initial  value 3208971.569779 
## iter  10 value 3184555.875730
## iter  20 value 3184363.549698
## iter  30 value 3184333.850437
## final  value 3184332.769218 
## converged
## # weights:  23
## initial  value 3209361.146383 
## iter  10 value 3184379.125503
## iter  20 value 3184330.574214
## final  value 3184330.046800 
## converged
## # weights:  67
## initial  value 3211828.645847 
## iter  10 value 3184424.213141
## iter  20 value 3184331.102153
## final  value 3184330.063824 
## converged
## # weights:  111
## initial  value 3200382.366016 
## iter  10 value 3184443.891626
## iter  20 value 3184331.347777
## final  value 3184330.071907 
## converged
## # weights:  23
## initial  value 3164955.329130 
## final  value 3134954.000000 
## converged
## # weights:  67
## initial  value 3156465.788861 
## final  value 3134954.000000 
## converged
## # weights:  111
## initial  value 3182534.573484 
## final  value 3134954.000000 
## converged
## # weights:  23
## initial  value 3177458.531104 
## iter  10 value 3134984.144214
## iter  20 value 3134961.237630
## final  value 3134960.980755 
## converged
## # weights:  67
## initial  value 3164528.679798 
## iter  10 value 3134971.292874
## final  value 3134957.953488 
## converged
## # weights:  111
## initial  value 3167597.610902 
## iter  10 value 3134973.001492
## iter  20 value 3134957.418505
## iter  20 value 3134957.390890
## final  value 3134957.226191 
## converged
## # weights:  23
## initial  value 3172176.403577 
## iter  10 value 3135011.124067
## iter  20 value 3134954.662453
## final  value 3134954.069033 
## converged
## # weights:  67
## initial  value 3168670.919047 
## iter  10 value 3135032.731438
## iter  20 value 3134954.927634
## final  value 3134954.059982 
## converged
## # weights:  111
## initial  value 3168581.913091 
## iter  10 value 3135113.593496
## iter  20 value 3134955.868479
## final  value 3134954.098624 
## converged
## # weights:  23
## initial  value 3134225.652481 
## final  value 3112880.000000 
## converged
## # weights:  67
## initial  value 3137796.040395 
## final  value 3112880.000000 
## converged
## # weights:  111
## initial  value 3151636.990600 
## final  value 3112880.000000 
## converged
## # weights:  23
## initial  value 3149310.943143 
## iter  10 value 3112898.412613
## final  value 3112887.007261 
## converged
## # weights:  67
## initial  value 3133585.284907 
## iter  10 value 3112891.489925
## final  value 3112885.138366 
## converged
## # weights:  111
## initial  value 3122000.537793 
## iter  10 value 3112890.183519
## iter  20 value 3112884.055507
## iter  30 value 3112882.902151
## final  value 3112882.764730 
## converged
## # weights:  23
## initial  value 3151633.457629 
## iter  10 value 3112973.404137
## iter  20 value 3112881.082000
## final  value 3112880.073337 
## converged
## # weights:  67
## initial  value 3140630.286775 
## iter  10 value 3112936.099231
## iter  20 value 3112880.664908
## final  value 3112880.062698 
## converged
## # weights:  111
## initial  value 3149900.122302 
## iter  10 value 3112979.676869
## iter  20 value 3112881.178881
## final  value 3112880.143596 
## converged
## # weights:  23
## initial  value 3268016.068991 
## final  value 3249187.000000 
## converged
## # weights:  67
## initial  value 3268144.034674 
## final  value 3249187.000000 
## converged
## # weights:  111
## initial  value 3283943.308673 
## final  value 3249187.000000 
## converged
## # weights:  23
## initial  value 3273292.881519 
## iter  10 value 3249199.302988
## final  value 3249194.016795 
## converged
## # weights:  67
## initial  value 3273661.113329 
## iter  10 value 3249251.600098
## iter  20 value 3249195.624578
## iter  30 value 3249191.205841
## final  value 3249190.923814 
## converged
## # weights:  111
## initial  value 3271561.800043 
## iter  10 value 3249238.638861
## iter  20 value 3249201.243493
## iter  30 value 3249190.682363
## iter  40 value 3249189.751984
## iter  40 value 3249189.750619
## iter  40 value 3249189.750377
## final  value 3249189.750377 
## converged
## # weights:  23
## initial  value 3274734.560823 
## iter  10 value 3249256.661323
## iter  20 value 3249187.808362
## final  value 3249187.060427 
## converged
## # weights:  67
## initial  value 3271660.240679 
## iter  10 value 3249219.773227
## iter  20 value 3249187.396483
## final  value 3249187.059334 
## converged
## # weights:  111
## initial  value 3272843.465883 
## final  value 3249187.043611 
## converged
## # weights:  23
## initial  value 3237494.578885 
## final  value 3211375.000000 
## converged
## # weights:  67
## initial  value 3233750.308575 
## final  value 3211375.000000 
## converged
## # weights:  111
## initial  value 3228582.810636 
## final  value 3211375.000000 
## converged
## # weights:  23
## initial  value 3250436.751642 
## iter  10 value 3211427.819101
## final  value 3211381.981478 
## converged
## # weights:  67
## initial  value 3232370.053395 
## iter  10 value 3211389.624619
## iter  20 value 3211380.226152
## final  value 3211378.900563 
## converged
## # weights:  111
## initial  value 3241113.516059 
## iter  10 value 3211736.616027
## iter  20 value 3211378.091771
## final  value 3211377.889768 
## converged
## # weights:  23
## initial  value 3241087.125639 
## iter  10 value 3211436.322071
## iter  20 value 3211375.714329
## final  value 3211375.055953 
## converged
## # weights:  67
## initial  value 3238068.774407 
## iter  10 value 3211482.363457
## iter  20 value 3211376.253021
## final  value 3211375.072160 
## converged
## # weights:  111
## initial  value 3233423.623787 
## iter  10 value 3211440.789762
## iter  20 value 3211375.786664
## final  value 3211375.080532 
## converged
## # weights:  23
## initial  value 3281544.508196 
## final  value 3259024.000000 
## converged
## # weights:  67
## initial  value 3285624.995426 
## final  value 3259024.000000 
## converged
## # weights:  111
## initial  value 3290295.338066 
## final  value 3259024.000000 
## converged
## # weights:  23
## initial  value 3292477.168145 
## iter  10 value 3259759.018096
## iter  20 value 3259037.644888
## final  value 3259036.491096 
## converged
## # weights:  67
## initial  value 3292437.571879 
## iter  10 value 3259229.294154
## iter  20 value 3259027.942876
## iter  20 value 3259027.940576
## iter  20 value 3259027.928763
## final  value 3259027.928763 
## converged
## # weights:  111
## initial  value 3274705.295826 
## iter  10 value 3259395.135778
## iter  20 value 3259027.282145
## final  value 3259026.796791 
## converged
## # weights:  23
## initial  value 3296859.733007 
## iter  10 value 3259733.733578
## iter  20 value 3259032.182671
## iter  30 value 3259024.094340
## final  value 3259024.038622 
## converged
## # weights:  67
## initial  value 3290490.606168 
## iter  10 value 3259138.701730
## iter  20 value 3259025.339810
## final  value 3259024.106921 
## converged
## # weights:  111
## initial  value 3295055.361910 
## final  value 3259024.145614 
## converged
## # weights:  23
## initial  value 2982956.316083 
## final  value 2945856.000000 
## converged
## # weights:  67
## initial  value 2963508.537360 
## final  value 2945856.000000 
## converged
## # weights:  111
## initial  value 2959916.723746 
## final  value 2945856.000000 
## converged
## # weights:  23
## initial  value 2966917.912006 
## iter  10 value 2945964.643062
## iter  20 value 2945868.465562
## final  value 2945863.010430 
## converged
## # weights:  67
## initial  value 2968132.172888 
## iter  10 value 2945864.447163
## final  value 2945861.029930 
## converged
## # weights:  111
## initial  value 2978581.916588 
## iter  10 value 2946169.075382
## iter  20 value 2945859.504909
## final  value 2945859.224339 
## converged
## # weights:  23
## initial  value 2961181.613540 
## iter  10 value 2945907.405535
## iter  20 value 2945856.600011
## final  value 2945856.048112 
## converged
## # weights:  67
## initial  value 2960933.284449 
## iter  10 value 2945946.879058
## iter  20 value 2945857.063564
## final  value 2945856.061984 
## converged
## # weights:  111
## initial  value 2983978.621513 
## iter  10 value 2945995.821764
## iter  20 value 2945857.639569
## final  value 2945856.129807 
## converged
## # weights:  23
## initial  value 3036469.812885 
## final  value 3007919.000000 
## converged
## # weights:  67
## initial  value 3038778.889324 
## final  value 3007919.000000 
## converged
## # weights:  111
## initial  value 3022428.073257 
## final  value 3007919.000000 
## converged
## # weights:  23
## initial  value 3024446.681053 
## iter  10 value 3007926.098056
## final  value 3007925.977955 
## converged
## # weights:  67
## initial  value 3036629.200722 
## iter  10 value 3007934.608218
## iter  20 value 3007923.068333
## final  value 3007922.872384 
## converged
## # weights:  111
## initial  value 3044947.988714 
## iter  10 value 3008280.597137
## iter  20 value 3007924.990708
## final  value 3007922.250018 
## converged
## # weights:  23
## initial  value 3042309.681169 
## iter  10 value 3008059.885393
## iter  20 value 3007920.628631
## final  value 3007919.070177 
## converged
## # weights:  67
## initial  value 3039082.750242 
## iter  10 value 3008034.065790
## iter  20 value 3007920.343778
## final  value 3007919.148520 
## converged
## # weights:  111
## initial  value 3022537.574001 
## final  value 3007919.049377 
## converged
## # weights:  23
## initial  value 3192074.739526 
## final  value 3154163.000000 
## converged
## # weights:  67
## initial  value 3190902.249499 
## final  value 3154163.000000 
## converged
## # weights:  111
## initial  value 3197053.490962 
## final  value 3154163.000000 
## converged
## # weights:  23
## initial  value 3179867.605178 
## iter  10 value 3154396.912467
## final  value 3154169.992770 
## converged
## # weights:  67
## initial  value 3175717.094462 
## iter  10 value 3154189.096683
## iter  20 value 3154167.080181
## iter  20 value 3154167.059679
## iter  20 value 3154167.059679
## final  value 3154167.059679 
## converged
## # weights:  111
## initial  value 3178217.418182 
## iter  10 value 3154331.007034
## final  value 3154173.183228 
## converged
## # weights:  23
## initial  value 3191526.313992 
## iter  10 value 3154218.156781
## iter  20 value 3154163.640396
## final  value 3154163.067419 
## converged
## # weights:  67
## initial  value 3180218.528679 
## iter  10 value 3154234.263703
## iter  20 value 3154163.839725
## final  value 3154163.104268 
## converged
## # weights:  111
## initial  value 3177797.481622 
## iter  10 value 3154254.744368
## iter  20 value 3154164.085722
## final  value 3154163.099708 
## converged
## # weights:  23
## initial  value 3204415.542651 
## final  value 3177835.000000 
## converged
## # weights:  67
## initial  value 3194059.096293 
## final  value 3177835.000000 
## converged
## # weights:  111
## initial  value 3196395.227032 
## final  value 3177835.000000 
## converged
## # weights:  23
## initial  value 3213441.361630 
## iter  10 value 3177854.501907
## final  value 3177841.978359 
## converged
## # weights:  67
## initial  value 3201567.874644 
## iter  10 value 3178310.281592
## iter  20 value 3177839.950674
## final  value 3177838.967177 
## converged
## # weights:  111
## initial  value 3201568.060069 
## iter  10 value 3177846.415203
## iter  20 value 3177842.981113
## iter  30 value 3177838.310169
## final  value 3177837.781601 
## converged
## # weights:  23
## initial  value 3202101.487795 
## iter  10 value 3177868.999589
## iter  20 value 3177835.396234
## final  value 3177835.046348 
## converged
## # weights:  67
## initial  value 3208565.273266 
## iter  10 value 3177939.217128
## iter  20 value 3177836.215643
## final  value 3177835.090279 
## converged
## # weights:  111
## initial  value 3205590.078696 
## iter  10 value 3177970.012346
## iter  20 value 3177836.585779
## final  value 3177835.073211 
## converged
## # weights:  23
## initial  value 3271921.107095 
## final  value 3237186.000000 
## converged
## # weights:  67
## initial  value 3258223.970317 
## final  value 3237186.000000 
## converged
## # weights:  111
## initial  value 3256361.494328 
## final  value 3237186.000000 
## converged
## # weights:  23
## initial  value 3275658.365772 
## iter  10 value 3237325.664099
## iter  20 value 3237193.338395
## final  value 3237193.140514 
## converged
## # weights:  67
## initial  value 3271542.985178 
## iter  10 value 3237417.102501
## iter  20 value 3237190.742023
## iter  20 value 3237190.710725
## final  value 3237189.937074 
## converged
## # weights:  111
## initial  value 3269548.826162 
## iter  10 value 3237193.700319
## iter  20 value 3237189.362138
## final  value 3237188.774944 
## converged
## # weights:  23
## initial  value 3269597.909854 
## iter  10 value 3237225.019547
## iter  20 value 3237186.455448
## final  value 3237186.053908 
## converged
## # weights:  67
## initial  value 3248593.093609 
## iter  10 value 3237227.028428
## iter  20 value 3237186.488186
## final  value 3237186.066047 
## converged
## # weights:  111
## initial  value 3255889.199338 
## final  value 3237186.051690 
## converged
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
## # weights:  23
## initial  value 3539189.970922 
## final  value 3498931.000000 
## converged
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.

## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
## Warning in is.na(weights): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: predictions failed for Fold01.Rep1: layer1=3, layer2=0, layer3=0 Error in nrow[w] * ncol[w] : non-numeric argument to binary operator
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
## Warning in is.na(weights): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: predictions failed for Fold02.Rep1: layer1=3, layer2=0, layer3=0 Error in nrow[w] * ncol[w] : non-numeric argument to binary operator
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
## Warning in is.na(weights): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: predictions failed for Fold03.Rep1: layer1=5, layer2=0, layer3=0 Error in nrow[w] * ncol[w] : non-numeric argument to binary operator
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
## Warning in is.na(weights): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: predictions failed for Fold04.Rep1: layer1=3, layer2=0, layer3=0 Error in nrow[w] * ncol[w] : non-numeric argument to binary operator
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
## Warning in is.na(weights): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: predictions failed for Fold05.Rep1: layer1=5, layer2=0, layer3=0 Error in nrow[w] * ncol[w] : non-numeric argument to binary operator
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
## Warning in is.na(weights): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: predictions failed for Fold06.Rep1: layer1=5, layer2=0, layer3=0 Error in nrow[w] * ncol[w] : non-numeric argument to binary operator
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
## Warning in is.na(weights): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: predictions failed for Fold08.Rep1: layer1=5, layer2=0, layer3=0 Error in nrow[w] * ncol[w] : non-numeric argument to binary operator
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
## Warning in is.na(weights): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: predictions failed for Fold09.Rep1: layer1=3, layer2=0, layer3=0 Error in nrow[w] * ncol[w] : non-numeric argument to binary operator
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
## Warning in is.na(weights): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: predictions failed for Fold09.Rep1: layer1=5, layer2=0, layer3=0 Error in nrow[w] * ncol[w] : non-numeric argument to binary operator
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
## Warning in is.na(weights): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: predictions failed for Fold02.Rep2: layer1=5, layer2=0, layer3=0 Error in nrow[w] * ncol[w] : non-numeric argument to binary operator
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
## Warning in is.na(weights): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: predictions failed for Fold03.Rep2: layer1=5, layer2=0, layer3=0 Error in nrow[w] * ncol[w] : non-numeric argument to binary operator
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
## Warning in is.na(weights): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: predictions failed for Fold04.Rep2: layer1=3, layer2=0, layer3=0 Error in nrow[w] * ncol[w] : non-numeric argument to binary operator
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
## Warning in is.na(weights): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: predictions failed for Fold05.Rep2: layer1=5, layer2=0, layer3=0 Error in nrow[w] * ncol[w] : non-numeric argument to binary operator
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
## Warning in is.na(weights): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: predictions failed for Fold06.Rep2: layer1=3, layer2=0, layer3=0 Error in nrow[w] * ncol[w] : non-numeric argument to binary operator
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
## Warning in is.na(weights): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: predictions failed for Fold06.Rep2: layer1=5, layer2=0, layer3=0 Error in nrow[w] * ncol[w] : non-numeric argument to binary operator
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
## Warning in is.na(weights): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: predictions failed for Fold07.Rep2: layer1=5, layer2=0, layer3=0 Error in nrow[w] * ncol[w] : non-numeric argument to binary operator
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
## Warning in is.na(weights): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: predictions failed for Fold10.Rep2: layer1=3, layer2=0, layer3=0 Error in nrow[w] * ncol[w] : non-numeric argument to binary operator
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
## Warning in is.na(weights): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: predictions failed for Fold10.Rep2: layer1=5, layer2=0, layer3=0 Error in nrow[w] * ncol[w] : non-numeric argument to binary operator
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
## Warning in is.na(weights): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: predictions failed for Fold03.Rep3: layer1=3, layer2=0, layer3=0 Error in nrow[w] * ncol[w] : non-numeric argument to binary operator
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
## Warning in is.na(weights): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: predictions failed for Fold04.Rep3: layer1=5, layer2=0, layer3=0 Error in nrow[w] * ncol[w] : non-numeric argument to binary operator
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
## Warning in is.na(weights): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: predictions failed for Fold08.Rep3: layer1=5, layer2=0, layer3=0 Error in nrow[w] * ncol[w] : non-numeric argument to binary operator
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
## Warning in is.na(weights): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: predictions failed for Fold09.Rep3: layer1=5, layer2=0, layer3=0 Error in nrow[w] * ncol[w] : non-numeric argument to binary operator
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
# Label the cross-validation summary columns for reporting
colnames(performetrics)[1:3] <- c("Method", "MAE", "RMSE")

View(performetrics)

# Drop the modelling scratch objects now that the results table is final
rm(i, control, methods, model_sj.cv, performetrics)

TIME SERIES ANALYSIS

# Build a weekly (frequency 52) time series of SJ total cases, anchored at
# the earliest observed (year, week-of-year) in the label data
start_year <- min(sj_train_labels.lastna$year)
start_week <- min(
  sj_train_labels.lastna$weekofyear[sj_train_labels.lastna$year == start_year]
)
ts_sj <- ts(
  sj_train_labels.lastna$total_cases,
  start = c(start_year, start_week),
  frequency = 52
)

# Raw series, then its trend/seasonal/remainder decomposition
plot(ts_sj, main = "SJ: Total_cases")

plot(decompose(ts_sj))

Holt-Winters filtering

# Holt-Winters smoothing of the SJ series:
#   fit1 — full triple exponential smoothing (level + trend + seasonal)
#   fit2 — simple exponential smoothing (beta/gamma disabled: level only)
fit1 <- HoltWinters(ts_sj)
fit2 <- HoltWinters(ts_sj, beta = FALSE, gamma = FALSE)

# Stack the two fitted-vs-observed plots vertically; par() returns the
# previous settings, which we restore afterwards so the 2x1 layout does
# not leak into every subsequent plot in the session
old_par <- par(mfrow = c(2, 1))
plot(fit1)
plot(fit2)
par(old_par)